Question: The code below loses its indentation when I paste it here, so here it is in a Google Doc as well:
https://docs.google.com/document/d/1AzsBv1PeVX1vbJTmsi40NrWXHitNs3qQU61nl8pQjTo/edit?usp=sharing
I need help allowing a text file of my choice to be used in place of robots.txt (in py.py). I also need help getting the BFS and DFS functions (located in webcrawler1.py) to work with py.py.
I need to be able to run it from the command line like this: webcrawler.py seeds.txt 100 /directory BFS (or DFS). The 100 is the number of nodes/pages to crawl. The crawler should save the source code of every page it visits, up to that limit, into the directory given on the command line, with each page's source in its own file and every file named differently (for example by appending a number to the file name). Finally, I need help writing the URL of each visited page to a text file. A rough sketch of what I mean is below.
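To make the requirement concrete, here is a minimal sketch of the behaviour I am describing (the helper names read_seeds and save_page, the file names page_N.html and visited_urls.txt, and the output layout are my own placeholders, not part of the assignment; the link-extraction step is left out):

import os
import sys
import urllib.request

def read_seeds(seed_file):
    """Read one seed URL per line from the seeds file."""
    with open(seed_file, encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]

def save_page(directory, page_number, html):
    """Write one page's source to its own numbered file, e.g. page_3.html."""
    os.makedirs(directory, exist_ok=True)
    path = os.path.join(directory, 'page_{}.html'.format(page_number))
    with open(path, 'w', encoding='utf-8') as f:
        f.write(html)

def main():
    # Expected call: python webcrawler.py seeds.txt 100 /directory BFS
    if len(sys.argv) != 5:
        print('Usage: webcrawler.py seeds.txt max_pages directory BFS|DFS')
        sys.exit(1)
    seed_file, max_pages, directory, method = sys.argv[1:5]
    max_pages = int(max_pages)

    to_visit = read_seeds(seed_file)
    visited = []
    while to_visit and len(visited) < max_pages:
        # BFS takes from the front of the list, DFS from the back.
        url = to_visit.pop(0) if method == 'BFS' else to_visit.pop()
        if url in visited:
            continue
        try:
            with urllib.request.urlopen(url) as response:
                html = response.read().decode('utf-8', errors='replace')
        except Exception as err:
            print('Skipping', url, err)
            continue
        visited.append(url)
        save_page(directory, len(visited), html)
        # (extracting links from html and adding them to to_visit would go here)

    # Write the URL of every page visited to a text file.
    with open('visited_urls.txt', 'w', encoding='utf-8') as f:
        for url in visited:
            f.write(url + '\n')

if __name__ == '__main__':
    main()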
webcrawler1.py
import urllib
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import requests
import collections
from Graph import Graph
from Node import Node
import sys
from time import gmtime, strftime
from timeout import timeout
from multiprocessing import Pool
from multiprocessing import Process
import json
import pdb
class WebCrawler:
    def __init__(self, originUrl, method, totalNodes, depthLimit=None, keyword=None):
        self.originUrl = originUrl
        self.method = method
        self.totalNodes = int(totalNodes)
        self.nodeCount = 0
        self.depthLimit = int(depthLimit)
        self.currentDepth = 0
        self.keyword = keyword
        self.keywordUrls = []
        self.nodeUrlMap = {}
        self.nodesToVisit = []
        self.visitedUrls = set()
        self.graph = Graph()
        self.nodeIndex = 0
        self.storeCookie()
        originTitle = self.getTitle(originUrl)
        startNode = Node(originUrl, None, originTitle)
        self.crawl(startNode)
    def crawl(self, node):
        print("crawl(): " + strftime("%H:%M:%S", gmtime()))
        visited = node.url in self.visitedUrls
        if not visited:
            self.graph.addNode(node, self.nodeIndex)
            self.nodeIndex += 1
            self.nodeCount += 1
            self.visitedUrls.add(node.url)
        if node.sourceNodes:  # If this is not the starting node
            sourceNode = node.sourceNodes.pop()
            if sourceNode.index is not None and node.index is not None:
                self.graph.addEdge(sourceNode.index, node.index)  # Add an edge between sourceNode and node
        if not visited:
            soup = self.generateSoup(node.url)
            hasKeyword = self.checkForKeyword(soup, node.url)
            if hasKeyword:
                node.keyword = True
            links = self.findLinks(soup)
            links = self.validLinks(links)
            links = {l for l in links}  # Remove duplicate links
            if links:
                if self.method == "BFS":
                    self.bfs(node, links)
                else:  # DFS
                    self.currentDepth += 1
                    if self.currentDepth >= self.depthLimit:  # If depth limit reached, getNextNode (up a level)
                        self.currentDepth = 0  # Reset currentDepth
                        self.getNextNode()
                    else:  # Otherwise, keep going deeper
                        self.dfs(node, links)
            else:  # No links present
                self.getNextNode()
        else:  # Avoid infinite loop
            self.getNextNode()
    def validLinks(self, links):
        print("validLinks(): " + strftime("%H:%M:%S", gmtime()))
        validLinks = []
        for link in links:
            # Only add links while there is still room
            if self.nodeCount + len(validLinks) <= self.totalNodes:
                if self.isValidUrl(link):
                    validLinks.append(link)
        return validLinks
    def isValidUrl(self, url):
        print("isValidUrl(): " + strftime("%H:%M:%S", gmtime()))
        extensionBlacklist = ["zip", "dmg", "msi", "tar", "exe", "sisx"]
        for x in extensionBlacklist:
            if x in url:
                return False
        if "http" not in url: return False
        parsed_url = urlparse(url)
        if not bool(parsed_url.scheme): return False
        try:
            self.testRequest(url)
        except:
            return False
        return True

    @timeout(3)
    def testRequest(self, url):
        requests.get(url)
    def getNextNode(self):
        print("getNextNode(): " + strftime("%H:%M:%S", gmtime()))
        if len(self.nodesToVisit) != 0 and not self.nodeLimitReached():
            # We use the same data structure to store nodesToVisit for BFS and DFS,
            # and pop elements off the same way. How the elements are added is
            # what's important (see the illustration after this method).
            nextNode = self.nodesToVisit.pop()
            self.crawl(nextNode)
        else:  # Crawl is over
            self.printGraph()
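    # Illustration (my own, not from the original code) of why one list works
    # for both strategies: bfs() inserts new nodes at index 0 and getNextNode()
    # pops from the end, giving first-in-first-out order, while dfs() appends
    # new nodes and pops from the end, giving last-in-first-out order.
    #
    #     q = []
    #     q.insert(0, 'a'); q.insert(0, 'b')   # BFS enqueue -> ['b', 'a']
    #     q.pop()                              # -> 'a' (oldest first)
    #
    #     s = []
    #     s.append('a'); s.append('b')         # DFS push -> ['a', 'b']
    #     s.pop()                              # -> 'b' (newest first)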
    def printGraph(self):
        for node in self.graph.nodes:
            print(" Node:")
            if node.title:
                print("Index: " + str(node.index))
                print("Title: " + node.title)
                print("URL: " + node.url)
                print("Keyword: " + str(node.keyword))
        if self.graph.edges:
            print(" Edges:")
            for e in self.graph.edges:
                print("Source: " + str(e.source) + " Target: " + str(e.target))
        if self.keywordUrls:
            print(" Keyword URLs:")
            for k in self.keywordUrls:
                print("URL: " + k)
        print(" JSON:")
        print(self.jsonSerialize())
    def jsonSerialize(self):
        for n in self.graph.nodes:
            n.sourceNodes = []
        self.graph.edges = list(self.graph.edges)
        return json.dumps(self.graph, default=lambda o: o.__dict__)

    def storeCookie(self):
        # Store graph as cookie (do this one)
        pass

    def nodeLimitReached(self):
        return self.nodeCount >= self.totalNodes
    # Convert URL into soup
    def generateSoup(self, url):
        print("generateSoup(): " + strftime("%H:%M:%S", gmtime()))
        sourceCode = requests.get(url)
        plainText = sourceCode.text
        soup = BeautifulSoup(plainText, "html.parser")
        return soup

    # Parse soup to find links
    def findLinks(self, soup):
        print("findLinks(): " + strftime("%H:%M:%S", gmtime()))
        links = soup.findAll('a')
        hrefs = []
        for link in links:
            href = link.get('href', '')
            hrefs.append(href)
        return hrefs
    def getTitle(self, url):
        print("getTitle(): " + strftime("%H:%M:%S", gmtime()))
        soup = self.generateSoup(url)
        titles = soup.findAll('title')
        if titles:
            # Strip newlines and the <title></title> tags from the title string
            title = str(titles[0]).replace("\n", "")
            title = title.replace("<title>", "").replace("</title>", "")
            return title
    def bfs(self, currentNode, links):
        print("bfs(): " + strftime("%H:%M:%S", gmtime()))
        for link in links:
            # If url is not already visited, and nodesToVisit+nodeCount hasn't exceeded totalNodes
            if link not in self.visitedUrls and self.nodeCount + len(self.nodesToVisit) <= self.totalNodes:
                title = self.getTitle(link)
                newNode = Node(link, [currentNode], title)
                newNode.sourceNodes.insert(0, currentNode)
                self.nodesToVisit.insert(0, newNode)
                self.nodeUrlMap[link] = newNode
            elif link in self.nodeUrlMap:  # Repeat URL, get existing node
                existingNode = self.nodeUrlMap[link]
                existingNode.sourceNodes.insert(0, currentNode)
                self.nodesToVisit.insert(0, existingNode)
        self.getNextNode()
    def dfs(self, currentNode, links):
        print("dfs(): " + strftime("%H:%M:%S", gmtime()))
        for link in links:
            if link not in self.visitedUrls:
                title = self.getTitle(link)
                newNode = Node(link, [currentNode], title)
                newNode.sourceNodes.append(currentNode)
                self.nodesToVisit.append(newNode)
            elif link in self.nodeUrlMap:  # Repeat URL, get existing node
                existingNode = self.nodeUrlMap[link]
                existingNode.sourceNodes.append(currentNode)
                self.nodesToVisit.append(existingNode)
        self.getNextNode()
    def checkForKeyword(self, soup, url):
        # If keyword found in soup, append url to keywordUrls
        if soup.body and soup.body.findAll(text=self.keyword):
            self.keywordUrls.append(url)
            return True


if __name__ == '__main__':
    webCrawler = WebCrawler(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5])
py.py
"""
implement a simple web crawler
Usage: crawler.py seed_url
seed: absolute url - the crawler will use it as the initial web address
"""
import urllib.request
import urllib.parse
import urllib.error
import urllib.robotparser
import re
import sys
# DO NOT CHANGE ok_to_crawl!!!
def ok_to_crawl(absolute_url):
    """
    check if it is OK to crawl the specified absolute url

    We are implementing polite crawling by checking the robots.txt file
    for all urls except the ones using the file scheme (these are urls
    on the local host and they are all OK to crawl.)
    We also use this function to skip over mailto: links and javascript: links.
    Parameter:
    absolute_url (string): this is an absolute url that we would like to crawl
    Returns:
    boolean: True if the scheme is file (it is a local webpage)
             True if we successfully read the corresponding robots.txt
             file and determined that user-agent * is allowed to crawl
             False if it is a mailto: link or a javascript: link,
             if user-agent * is not allowed to crawl it, or
             if it is NOT an absolute url.
    """
    if absolute_url.lower().startswith('mailto:'):
        return False
    if absolute_url.lower().startswith('javascript:'):
        return False
    link_obj = urllib.parse.urlparse(absolute_url)
    if link_obj.scheme.lower().startswith('file'):
        return True
    # check if the url given as input is an absolute url
    if not link_obj.scheme or not link_obj.hostname:
        print('Not a valid absolute url: ', absolute_url)
        return False
    # construct the robots.txt url from the scheme and host name
    else:
        robot_url = link_obj.scheme + '://' + link_obj.hostname + '/robots.txt'
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(robot_url)
        try:
            rp.read()
        except:
            print("Error accessing robot file: ", robot_url)
            return False
        else:
            return rp.can_fetch("*", absolute_url)
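# Sketch (not part of the assignment) for the "use a text file of my choice in
# place of robots.txt" question: RobotFileParser.parse() accepts the lines of a
# robots.txt-style file, so the rules can be loaded from a local file (the name
# rules.txt below is just an example) instead of fetching scheme://host/robots.txt.
def ok_to_crawl_from_file(absolute_url, rules_path='rules.txt'):
    rp = urllib.robotparser.RobotFileParser()
    try:
        with open(rules_path, encoding='utf-8') as rules_file:
            rp.parse(rules_file.read().splitlines())
    except OSError as err:
        print("Error reading rules file: ", rules_path, err)
        return False
    return rp.can_fetch("*", absolute_url)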
# DO NOT CHANGE crawl!!!
def crawl(seed_url):
    """
    start with the seed_url and crawl up to 10 urls

    Parameter:
    seed_url (string) - this is the first url we'll visit.
    Returns:
    set of strings - set of all the urls we have visited.
    """
    urls_tocrawl = {seed_url}  # initialize our set of urls to crawl
    urls_visited = set()  # initialize our set of urls visited
    while urls_tocrawl and len(urls_visited) < 10:
        current_url = urls_tocrawl.pop()  # just get any url from the set
        if current_url not in urls_visited:  # check if we have crawled it before
            page = get_page(current_url)
            if page:
                more_urls = extract_links(current_url, page)  # get the links
                urls_tocrawl = urls_tocrawl | more_urls  # add them to be crawled
            urls_visited.add(current_url)
    return urls_visited
#------------Do not change anything above this line-----------------------------
def get_page(url):
    """
    generate a web page of html in string from url

    params: absolute url as string
    return: if there is URLError or DecodeError, return an empty string
            else return the full html page content as string
    """
    try:
        with urllib.request.urlopen(url) as url_file:
            page_string = url_file.read().decode('UTF-8')
            return page_string
    except urllib.error.URLError as url_err:
        print("Error opening url: ", url, url_err)
        return ""
    except UnicodeDecodeError as decode_err:
        print("Error decoding url", url, decode_err)
        return ""
def extract_links(base_url, page):
    """
    extract the links contained in the page at the base_url

    Parameters:
    base_url (string): the url we are currently crawling - web address
    page(string): the content of that url - html
    Returns:
    A set of absolute urls (set of strings) - These are all the urls extracted
    from the current url and converted to absolute urls.
    """
    urls_set = set()
    # Match the href="..." values in the page. The original pattern was lost
    # when the code was pasted, so this is an assumed equivalent.
    page_links = re.findall(r'href\s*=\s*["\'](.*?)["\']', page)
    for link in page_links:
        # Convert each link to an absolute url
        absolute_url = urllib.parse.urljoin(base_url, link)
        urls_set.add(absolute_url)
    return urls_set
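# Quick illustration (my own example URLs, not from the assignment) of how
# urljoin resolves relative links against the page being crawled:
#     urllib.parse.urljoin('http://example.com/a/b.html', 'c.html')
#         -> 'http://example.com/a/c.html'
#     urllib.parse.urljoin('http://example.com/a/b.html', '/d.html')
#         -> 'http://example.com/d.html'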
def main():
    # Expect exactly one command line argument, the seed url
    # (assumed reconstruction; the pasted version of main was incomplete).
    if len(sys.argv) != 2:
        print('Usage: python py.py seed_url')
        return
    seed_url = sys.argv[1]
    urls_visited = crawl(seed_url)
    # Write each visited url to crawled.txt, one per line
    with open('crawled.txt', 'w', encoding='utf-8') as new_file:
        for url in urls_visited:
            new_file.write(url + "\n")


if __name__ == '__main__':
    main()