
Lab: Create a topical crawler in R
Due: Submit your R code on Moodle along with a screenshot of your results.

Requirements:

- Follow the basic crawling algorithm provided in the slides
- Crawl 50 pages for your repository
- Only store websites that contain at least one term in the body text from a list of keywords chosen by your group
- Store the following information in a character vector:

Error checking requirements:
- If the link in the frontier is a link to a jpg, go to the next item in the frontier
- If the retrieved page is less than 10 characters, go to the next item in the frontier
- Check for relative/absolute paths when adding to the frontier (see the sketch after this list)
- You may come across other implementation challenges during testing
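One possible way to handle the relative/absolute path check is sketched below. It reuses the "domain" and "anchor" variables that appear later in the sample code (the sample snippets themselves simply skip links that do not start with "http"); treat this as an illustration, not the required implementation.

link <- str_trim(anchor[[i]][1])
if(!str_detect(link,"^http")){
  #relative path: prepend the page's domain to make it absolute
  link <- paste0(domain,"/",str_replace(link,"^/",""))
}
frontier <- append(frontier,link)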

Hints: Packages that will be useful: RCurl, XML, stringr, httr

getURL call: doc <- tryCatch(getURL(exploredlink),error=function(cond){return("")})

get the title:
titleText <- xmlToDataFrame(nodes = getNodeSet(doc, "//title"))
titleText <- as.vector(titleText$text)
titleText <- unique(titleText)

Retrieves the body text from a page:
bodyText <- tryCatch(htmlToText(content(GET(exploredlink),type="text/html",as="text")),error=function(cond){return("")})

Parses words into a vector:
bodyText <- str_split(tolower(str_replace_all(str_replace_all(bodyText,"(\\t|\\r|\\n)"," "),"\\s{2,}"," "))," ")[[1]]

Parsing links from a page:
anchor <- getNodeSet(doc, "//a")
anchor <- sapply(anchor, function(x) xmlGetAttr(x, "href"))

The any() function checks whether a logical vector contains at least one TRUE value

x %in% y returns a logical vector indicating, for each element of x, whether it appears in y
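Taken together, these two hints give the keyword test required above; a minimal sketch, assuming bodyText and topicwords are defined as in the sample code:

#TRUE if at least one topic word appears among the page's words
if(any(topicwords %in% bodyText)){
  #store this page in the repository
}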

SAMPLE CODE FOR USE:

#Write a topical crawler using the information provided below:

##Start your code with these libraries:
library(RCurl)
library(XML)
library(stringr)
library(httr)

htmlToText <- function(input, ...) {
  ###--- PACKAGES ---###
  require(RCurl)
  require(XML)

  ###--- LOCAL FUNCTIONS ---###
  # Determine how to grab html for a single input element
  evaluate_input <- function(input) {
    # if input is a .html file
    if(file.exists(input)) {
      char.vec <- readLines(input, warn = FALSE)
      return(paste(char.vec, collapse = ""))
    }
    # if input is html text (the tag in this check was lost in the handout; "</html>" is a reasonable choice)
    if(grepl("</html>", input, fixed = TRUE)) return(input)
    # if input is a URL, probably should use a regex here instead?
    if(!grepl(" ", input)) {
      # download SSL certificate in case of https problem
      if(!file.exists("cacert.perm")) download.file(url="http://curl.haxx.se/ca/cacert.pem", destfile="cacert.perm")
      return(getURL(input, followlocation = TRUE, cainfo = "cacert.perm"))
    }
    # return NULL if none of the conditions above apply
    return(NULL)
  }

  # convert HTML to plain text
  convert_html_to_text <- function(html) {
    doc <- htmlParse(html, asText = TRUE)
    text <- xpathSApply(doc, "//text()[not(ancestor::script)][not(ancestor::style)][not(ancestor::noscript)][not(ancestor::form)]", xmlValue)
    return(text)
  }

  # format text vector into one character string
  collapse_text <- function(txt) {
    return(paste(txt, collapse = " "))
  }

  ###--- MAIN ---###
  # STEP 1: Evaluate input
  html.list <- lapply(input, evaluate_input)
  # STEP 2: Extract text from HTML
  text.list <- lapply(html.list, convert_html_to_text)
  # STEP 3: Return text
  text.vector <- sapply(text.list, collapse_text)
  return(text.vector)
}

###Run the function code for htmlToText() (Be sure this function is listed in your Environment)
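A quick sanity check that the function loaded correctly might look like this (assuming network access; the URL is just an example):

testText <- htmlToText("http://www.r-project.org")
substr(testText, 1, 200) #first 200 characters of the page's visible text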

###Load the first element in the frontier to an "exploredlink" variable

frontier <- c("http://www.cnn.com","http://www.kdnuggets.com","http://news.google.com")

topicwords<-c("technology","school","web","mining","news")

num <- 50 #total number of items to crawl
result <- c()
j <- 0 #number of items in the repository

while (j < num){

  if(length(frontier)<1){ break }

  #grab the first item in the frontier and place in the "exploredlink" variable
  exploredlink <- frontier[1]
  frontier <- frontier[-1]

  if(str_detect(exploredlink,"\\.jpg$")){ next }

  #fill in your code here
}

############ USEFUL CODE SNIPPETS ########

#How to get HTML
doc <- tryCatch(getURL(exploredlink),error=function(cond){return("")})

if(str_length(doc)<10){ next }

doc <- htmlParse(doc)

domain<-str_extract(exploredlink,pattern = ".*\\.com")

if(is.na(domain)){ next }
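Note that the ".*\.com" pattern above only matches .com sites, so pages from any other domain are skipped by the is.na() check. If your group wants to crawl other top-level domains as well, one possible alternative (an assumption, not part of the original handout) is:

#extract scheme + host for any domain, e.g. "http://news.google.com"
domain <- str_extract(exploredlink, "^https?://[^/]+")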

###

#How to get a title
titleText <- tryCatch(xmlToDataFrame(nodes = getNodeSet(doc, "//title")),error=function(cond){return("")})
if(titleText==""){ next }
titleText <- as.vector(titleText$text)
titleText <- unique(titleText)

###

#How to get body text
bodyText <- tryCatch(htmlToText(content(GET(exploredlink),type="text/html",as="text")),error=function(cond){return("")})

bodyText <- str_split(tolower(str_replace_all(str_replace_all(bodyText,"(\\t|\\r|\\n)"," "),"\\s{2,}"," "))," ")[[1]]

###

#How to get links from a page
anchor <- getNodeSet(doc, "//a")
anchor <- sapply(anchor, function(x) xmlGetAttr(x, "href"))

if(length(anchor)>0){
  temp <- c()
  for(i in 1:length(anchor)){
    if(is.null(anchor[[i]])){ next }
    if(!str_detect(anchor[[i]][1],"^http")){ next }
    if(str_detect(anchor[[i]][1],domain)){ next }
    temp <- append(temp,str_trim(anchor[[i]][1]))
  }
  anchor <- temp
  rm(temp)
  frontier <- append(frontier,anchor)
  frontier <- unique(frontier)
}

###
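One possible way to fill in the loop body using the snippets above is sketched here; exactly which fields you store in "result" is up to your group (the URL/title pairing below is only an illustration):

#keep the page only if its body text mentions at least one topic word
if(any(topicwords %in% bodyText)){
  result <- append(result, paste(exploredlink, titleText[1], sep=" | "))
  j <- j + 1 #one more item in the repository
}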
