Question: IT LITERALLY WILL NOT KEEP AN INDENT. Here it is in a doc: https://docs.google.com/document/d/1AzsBv1PeVX1vbJTmsi40NrWXHitNs3qQU61nl8pQjTo/edit?usp=sharing

I need help allowing a text file of my choice to be used in place of robots.txt in py.py. I also need help getting the bfs and dfs functionality to work in py.py (the bfs and dfs functions themselves are located in webcrawler1.py).

I need to be able to use this command line: webcrawler.py seeds.txt 100 /directory BFS (or DFS). The 100 is the number of nodes to crawl. I need it to put the source code of each page visited, up to that number of pages, into the directory given as /directory above. Each page's source code must go into a different file, and each file must have a different name, maybe by adding a number after each file name. I also need help putting the URLs of each page visited into a text file.
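Here is a rough sketch of the kind of helpers I have in mind for the saving part (the names save_page_source, record_visited_url, and visited_urls.txt are just placeholders I made up, not from my actual code):

import os
import requests

def save_page_source(url, page_number, directory):
    """Save one page's HTML into directory as page_1.html, page_2.html, ..."""
    os.makedirs(directory, exist_ok=True)
    filename = os.path.join(directory, "page_" + str(page_number) + ".html")
    with open(filename, "w", encoding="utf-8") as out_file:
        out_file.write(requests.get(url).text)
    return filename

def record_visited_url(url, directory):
    """Append the url of each visited page to a text file in the same directory."""
    with open(os.path.join(directory, "visited_urls.txt"), "a", encoding="utf-8") as url_file:
        url_file.write(url + "\n")

The idea is that the crawler would call save_page_source and record_visited_url once for every page it actually visits, with page_number counting up from 1 so every file gets a different name.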

webcrawler1.py

import urllib
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import requests
import collections
from Graph import Graph
from Node import Node
import sys
from time import gmtime, strftime
from timeout import timeout
from multiprocessing import Pool
from multiprocessing import Process
import json
import pdb

class WebCrawler:

    def __init__(self, originUrl, method, totalNodes, depthLimit=None, keyword=None):
        self.originUrl = originUrl
        self.method = method
        self.totalNodes = int(totalNodes)
        self.nodeCount = 0
        self.depthLimit = int(depthLimit)
        self.currentDepth = 0
        self.keyword = keyword
        self.keywordUrls = []
        self.nodeUrlMap = {}
        self.nodesToVisit = []
        self.visitedUrls = set()
        self.graph = Graph()
        self.nodeIndex = 0
        self.storeCookie()
        originTitle = self.getTitle(originUrl)
        startNode = Node(originUrl, None, originTitle)
        self.crawl(startNode)

    def crawl(self, node):
        print("crawl(): " + strftime("%H:%M:%S", gmtime()))
        visited = node.url in self.visitedUrls
        if not visited:
            self.graph.addNode(node, self.nodeIndex)
            self.nodeIndex += 1
            self.nodeCount += 1
            self.visitedUrls.add(node.url)
        if node.sourceNodes:  # If this is not the starting node
            sourceNode = node.sourceNodes.pop()
            if sourceNode.index is not None and node.index is not None:
                self.graph.addEdge(sourceNode.index, node.index)  # Add an edge between sourceNode and node
        if not visited:
            soup = self.generateSoup(node.url)
            hasKeyword = self.checkForKeyword(soup, node.url)
            if hasKeyword:
                node.keyword = True
            links = self.findLinks(soup)
            links = self.validLinks(links)
            links = {l for l in links}  # Remove duplicate links
            if links:
                if self.method == "BFS":
                    self.bfs(node, links)
                else:  # DFS
                    self.currentDepth += 1
                    if self.currentDepth >= self.depthLimit:  # If depth limit reached, getNextNode (up a level)
                        self.currentDepth = 0  # Reset currentDepth
                        self.getNextNode()
                    else:  # Otherwise, keep going deeper
                        self.dfs(node, links)
            else:  # No links present
                self.getNextNode()
        else:  # Avoid infinite loop
            self.getNextNode()

    def validLinks(self, links):
        print("validLinks(): " + strftime("%H:%M:%S", gmtime()))
        validLinks = []
        for link in links:
            # Only add links while there is still room
            if self.nodeCount + len(validLinks) <= self.totalNodes:
                if self.isValidUrl(link):
                    validLinks.append(link)
        return validLinks

    def isValidUrl(self, url):
        print("isValidUrl(): " + strftime("%H:%M:%S", gmtime()))
        extensionBlacklist = ["zip", "dmg", "msi", "tar", "exe", "sisx"]
        for x in extensionBlacklist:
            if x in url:
                return False
        if "http" not in url: return False
        parsed_url = urlparse(url)
        if not bool(parsed_url.scheme): return False
        try:
            self.testRequest(url)
        except:
            return False
        return True

    @timeout(3)
    def testRequest(self, url):
        requests.get(url)

    def getNextNode(self):
        print("getNextNode(): " + strftime("%H:%M:%S", gmtime()))
        if len(self.nodesToVisit) != 0 and not self.nodeLimitReached():
            # We use the same data structure to store urlsToVisit for BFS and DFS,
            # and pop elements off the same way. How the elements are added is
            # what's important.
            nextNode = self.nodesToVisit.pop()
            self.crawl(nextNode)
        else:  # Crawl is over
            self.printGraph()

    def printGraph(self):
        for node in self.graph.nodes:
            print(" Node:")
            if node.title:
                print("Index: " + str(node.index))
                print("Title: " + node.title)
                print("URL: " + node.url)
                print("Keyword: " + str(node.keyword))
        if self.graph.edges:
            print(" Edges:")
            edgeCount = 0
            for e in self.graph.edges:
                print("Source: " + str(e.source) + " Target: " + str(e.target))
        if self.keywordUrls:
            print(" Keyword URLs:")
            for k in self.keywordUrls:
                print("URL: " + k)
        print(" JSON:")
        print(self.jsonSerialize())

    def jsonSerialize(self):
        for n in self.graph.nodes:
            n.sourceNodes = []
        self.graph.edges = list(self.graph.edges)
        return json.dumps(self.graph, default=lambda o: o.__dict__)

    def storeCookie(self):
        # Store graph as cookie (do this one)
        pass

    def nodeLimitReached(self):
        return self.nodeCount >= self.totalNodes

    # Convert URL into soup
    def generateSoup(self, url):
        print("generateSoup(): " + strftime("%H:%M:%S", gmtime()))
        sourceCode = requests.get(url)
        plainText = sourceCode.text
        soup = BeautifulSoup(plainText, "html.parser")
        return soup

    # Parse soup to find links
    def findLinks(self, soup):
        print("findLinks(): " + strftime("%H:%M:%S", gmtime()))
        links = soup.findAll('a')
        hrefs = []
        for link in links:
            href = link.get('href', '')
            hrefs.append(href)
        return hrefs

    def getTitle(self, url):
        print("getTitle(): " + strftime("%H:%M:%S", gmtime()))
        soup = self.generateSoup(url)
        titles = soup.findAll('title')
        if titles:
            # Remove newlines and the surrounding <title> tags
            title = str(titles[0]).replace("\n", "")
            title = title.replace("<title>", "").replace("</title>", "")
            return title

    def bfs(self, currentNode, links):
        print("bfs(): " + strftime("%H:%M:%S", gmtime()))
        for link in links:
            # If url is not already visited, and nodesToVisit+nodeCount hasn't exceeded totalNodes
            if link not in self.visitedUrls and self.nodeCount + len(self.nodesToVisit) <= self.totalNodes:
                title = self.getTitle(link)
                newNode = Node(link, [currentNode], title)
                newNode.sourceNodes.insert(0, currentNode)
                self.nodesToVisit.insert(0, newNode)
                self.nodeUrlMap[link] = newNode
            elif link in self.nodeUrlMap:  # Repeat URL, get existing node
                existingNode = self.nodeUrlMap[link]
                existingNode.sourceNodes.insert(0, currentNode)
                self.nodesToVisit.insert(0, existingNode)
        self.getNextNode()

    def dfs(self, currentNode, links):
        print("dfs(): " + strftime("%H:%M:%S", gmtime()))
        for link in links:
            if link not in self.visitedUrls:
                title = self.getTitle(link)
                newNode = Node(link, [currentNode], title)
                newNode.sourceNodes.append(currentNode)
                self.nodesToVisit.append(newNode)
            elif link in self.nodeUrlMap:  # Repeat URL, get existing node
                existingNode = self.nodeUrlMap[link]
                existingNode.sourceNodes.append(currentNode)
                self.nodesToVisit.append(existingNode)
        self.getNextNode()

    def checkForKeyword(self, soup, url):
        # If keyword found in soup, append url to keywordUrls
        if soup.body and soup.body.findAll(text=self.keyword):
            self.keywordUrls.append(url)
            return True

if __name__ == '__main__':
    webCrawler = WebCrawler(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5])
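For reference, this is the plain-list version of the two search orders I'm trying to get working (just a sketch I wrote with url strings and a collections.deque to check my understanding; crawl_iterative and get_links are made-up names, not part of the assignment files):

from collections import deque

def crawl_iterative(seed_url, method, total_nodes, get_links):
    """Visit up to total_nodes urls; get_links(url) must return a list of url strings."""
    frontier = deque([seed_url])
    visited = []
    while frontier and len(visited) < total_nodes:
        # BFS takes urls from the front of the queue, DFS from the back (like a stack)
        url = frontier.popleft() if method == "BFS" else frontier.pop()
        if url in visited:
            continue
        visited.append(url)
        for link in get_links(url):
            if link not in visited:
                frontier.append(link)
    return visited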


py.py

"""

implement a simple web crawler

Usage: crawler.py seed_url

seed: absolute url - the crawler will use it as the initial web address

"""

import urllib.request
import urllib.parse
import urllib.error
import urllib.robotparser
import re
import sys

# DO NOT CHANGE ok_to_crawl!!!

def ok_to_crawl(absolute_url):
    """
    check if it is OK to crawl the specified absolute url
    We are implementing polite crawling by checking the robots.txt file
    for all urls except the ones using the file scheme (these are urls
    on the local host and they are all OK to crawl.)
    We also use this function to skip over mailto: links and javascript: links.
    Parameter:
    absolute_url (string): this is an absolute url that we would like to crawl
    Returns:
    boolean: True if the scheme is file (it is a local webpage)
             True if we successfully read the corresponding robots.txt
             file and determined that user-agent * is allowed to crawl
             False if it is a mailto: link or a javascript: link,
             if user-agent * is not allowed to crawl it or
             if it is NOT an absolute url.
    """
    if absolute_url.lower().startswith('mailto:'):
        return False
    if absolute_url.lower().startswith('javascript:'):
        return False
    link_obj = urllib.parse.urlparse(absolute_url)
    if link_obj.scheme.lower().startswith('file'):
        return True
    # check if the url given as input is an absolute url
    if not link_obj.scheme or not link_obj.hostname:
        print('Not a valid absolute url: ', absolute_url)
        return False
    # construct the robots.txt url from the scheme and host name
    else:
        robot_url = link_obj.scheme + '://' + link_obj.hostname + '/robots.txt'
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(robot_url)
        try:
            rp.read()
        except:
            print("Error accessing robot file: ", robot_url)
            return False
        else:
            return rp.can_fetch("*", absolute_url)

# DO NOT CHANGE crawl!!!

def crawl(seed_url):
    """
    start with the seed_url and crawl up to 10 urls
    Parameter:
    seed_url (string) - this is the first url we'll visit.
    Returns:
    set of strings - set of all the urls we have visited.
    """
    urls_tocrawl = {seed_url}  # initialize our set of urls to crawl
    urls_visited = set()  # initialize our set of urls visited
    while urls_tocrawl and len(urls_visited) < 10:
        current_url = urls_tocrawl.pop()  # just get any url from the set
        if current_url not in urls_visited:  # check if we have crawled it before
            page = get_page(current_url)
            if page:
                more_urls = extract_links(current_url, page)  # get the links
                urls_tocrawl = urls_tocrawl | more_urls  # add them to be crawled
            urls_visited.add(current_url)
    return urls_visited

#------------Do not change anything above this line-----------------------------

def get_page(url):
    """
    generate a web page of html in string from url
    params: absolute url as string
    return: if there is URLError or DecodeError, return an empty string
    else return the full html page content as string
    """
    try:
        with urllib.request.urlopen(url) as url_file:
            page_string = url_file.read().decode('UTF-8')
            return page_string
    except urllib.error.URLError as url_err:
        print("Error opening url: ", url, url_err)
        return ""  # empty string is falsy, so crawl() skips this page
    except UnicodeDecodeError as decode_err:
        print("Error decoding url", url, decode_err)
        return ""

def extract_links(base_url, page):
    """
    extract the links contained in the page at the base_url
    Parameters:
    base_url (string): the url we are currently crawling - web address
    page (string): the content of that url - html
    Returns:
    A set of absolute urls (set of strings) - These are all the urls extracted
    from the current url and converted to absolute urls.
    """
    urls_set = set()
    # find anchor tags and capture the href value
    page_links = re.findall('<a href="?\'?([^"\'>]*)', page)
    for link in page_links:
        # Convert each link to an absolute url
        link = urllib.parse.urljoin(base_url, link)
        urls_set.add(link)
    return urls_set

def main():
    if len(sys.argv) != 2:
        print("Usage: py.py seed_url")
        return
    urls_visited = crawl(sys.argv[1])
    # write the url of each page visited to a text file, one per line
    with open('crawled.txt', 'w', encoding='utf-8') as new_file:
        for url in urls_visited:
            new_file.write(url + "\n")


main()
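For the "text file instead of robots.txt" part, this is roughly what I was picturing (a sketch assuming the rules file, here called my_rules.txt, is written in the same format as a robots.txt file; ok_to_crawl itself is marked do-not-change, so this would have to be a separate helper):

import urllib.robotparser

def ok_to_crawl_from_file(absolute_url, rules_file):
    """Check a url against rules read from a local robots.txt-style text file."""
    rp = urllib.robotparser.RobotFileParser()
    with open(rules_file, encoding='utf-8') as rules:
        # parse() accepts the lines of a robots.txt file, so the rules
        # can come from any local text file instead of the live site
        rp.parse(rules.read().splitlines())
    return rp.can_fetch("*", absolute_url)

So something like ok_to_crawl_from_file(current_url, 'my_rules.txt') would stand in for the ok_to_crawl(current_url) check, if that is allowed.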
