Question: This is a PYTHON question. I do not know anyother language. Problem: Make a getContent function, that returns the text-content of a webpage. online the

This is a PYTHON question. I do not know anyother language.

Problem: Make a getContent function, that returns the text-content of a webpage. online the text no tags

from urllib.request import urlopen def getSource(url): 'returns the content of resource specified by url as a string' response = urlopen(url) html = response.read() return html.decode()

from html.parser import HTMLParser class LinkParser(HTMLParser): '''HTML doc parser that prints values of href attributes in anchor start tags'''

def handle_starttag(self, tag, attrs): 'print value of href attribute if any' if tag == 'a': # if anchor tag

# search for href attribute and print its value for attr in attrs: if attr[0] == 'href': print(attr[1])

from urllib.parse import urljoin from html.parser import HTMLParser class Collector(HTMLParser): 'collects hyperlink URLs into a list'

def __init__(self, url): 'initializes parser, the url, and a list' HTMLParser.__init__(self) self.url = url self.links = []

# Solution to practice problem 11.3 self.text = '' def handle_starttag(self, tag, attrs): 'collects hyperlink URLs in their absolute format' if tag == 'a': for attr in attrs: if attr[0] == 'href': # construct absolute URL absolute = urljoin(self.url, attr[1]) if absolute[:4] == 'http': # collect HTTP URLs self.links.append(absolute) # Solution to practice problem 11.3 def handle_data(self, data): 'collects and concatenates text data' self.text += data

def getLinks(self): 'returns hyperlinks URLs in their absolute format' return self.links

# Solution to practice problem 11.3 def getData(self): 'returns the concatenation of all text data' return self.text

from urllib.request import urlopen def analyze(url): '''prints the frequency of every word in web page url and prints and returns the list of http links, in absolute format, in it''' print(' Visiting', url) # for testing

# obtain links in the web page content = urlopen(url).read().decode() collector = Collector(url) collector.feed(content) urls = collector.getLinks() # get list of links

# compute word frequencies content = collector.getData() # get text data as a string freq = frequency(content)

# print the frequency of every text data word in web page print(' {:45} {:10} {:5}'.format('URL', 'word', 'count')) for word in freq: print('{:45} {:10} {:5}'.format(url, word, freq[word]))

# print the http links found in web page print(' {:45} {:10}'.format('URL', 'link')) for link in urls: print('{:45} {:10}'.format(url, link))

return urls

def crawl1(url): 'recursive web crawler that calls analyze() on every web page'

# analyze() returns a list of hyperlink URLs in web page url links = analyze(url)

# recursively continue crawl from every link in links for link in links: try: # try block because link may not be valid HTML file crawl1(link) except: # if an exception is thrown, pass # ignore and move on.

visited = set() # initialize visited to an empty set

def crawl2(url): '''a recursive web crawler that calls analyze() on every visited web page'''

# add url to set of visited pages global visited # while not necessary, warns the programmer visited.add(url)

# analyze() returns a list of hyperlink URLs in web page url links = analyze(url)

# recursively continue crawl from every link in links for link in links: # follow link only if not visited if link not in visited: try: crawl2(link) except: pass

Step by Step Solution

There are 3 Steps involved in it

1 Expert Approved Answer
Step: 1 Unlock blur-text-image
Question Has Been Solved by an Expert!

Get step-by-step solutions from verified subject matter experts

Step: 2 Unlock
Step: 3 Unlock

Students Have Also Explored These Related Databases Questions!