Question: The code below loses its indentation when I paste it here, so here it is in a Google Doc as well:
https://docs.google.com/document/d/1AzsBv1PeVX1vbJTmsi40NrWXHitNs3qQU61nl8pQjTo/edit?usp=sharing
I need help allowing a text file of my choice to be used in place of robots.txt (in py.py). I also need help getting the BFS and DFS functions (located in webcrawler1.py) to work with py.py.
I need to be able to run it from the command line like this: webcrawler.py seeds.txt 100 /directory BFS (or DFS). The 100 is the number of nodes/pages to crawl. The crawler should save the source code of every page it visits, up to that limit, into the directory given on the command line, with each page's source in its own file and every file named differently (for example by appending a number to the file name). Finally, I need help writing the URL of each visited page to a text file. A rough sketch of what I mean is below.
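To make the requirement concrete, here is a minimal sketch of the behaviour I am describing (the helper names read_seeds and save_page, the file names page_N.html and visited_urls.txt, and the output layout are my own placeholders, not part of the assignment; the link-extraction step is left out):

import os
import sys
import urllib.request

def read_seeds(seed_file):
    """Read one seed URL per line from the seeds file."""
    with open(seed_file, encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]

def save_page(directory, page_number, html):
    """Write one page's source to its own numbered file, e.g. page_3.html."""
    os.makedirs(directory, exist_ok=True)
    path = os.path.join(directory, 'page_{}.html'.format(page_number))
    with open(path, 'w', encoding='utf-8') as f:
        f.write(html)

def main():
    # Expected call: python webcrawler.py seeds.txt 100 /directory BFS
    if len(sys.argv) != 5:
        print('Usage: webcrawler.py seeds.txt max_pages directory BFS|DFS')
        sys.exit(1)
    seed_file, max_pages, directory, method = sys.argv[1:5]
    max_pages = int(max_pages)

    to_visit = read_seeds(seed_file)
    visited = []
    while to_visit and len(visited) < max_pages:
        # BFS takes from the front of the list, DFS from the back.
        url = to_visit.pop(0) if method == 'BFS' else to_visit.pop()
        if url in visited:
            continue
        try:
            with urllib.request.urlopen(url) as response:
                html = response.read().decode('utf-8', errors='replace')
        except Exception as err:
            print('Skipping', url, err)
            continue
        visited.append(url)
        save_page(directory, len(visited), html)
        # (extracting links from html and adding them to to_visit would go here)

    # Write the URL of every page visited to a text file.
    with open('visited_urls.txt', 'w', encoding='utf-8') as f:
        for url in visited:
            f.write(url + '\n')

if __name__ == '__main__':
    main()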
webcrawler1.py
import urllib
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import requests
import collections
from Graph import Graph
from Node import Node
import sys
from time import gmtime, strftime
from timeout import timeout
from multiprocessing import Pool
from multiprocessing import Process
import json
import pdb
class WebCrawler:
    def __init__(self, originUrl, method, totalNodes, depthLimit=None, keyword=None):
        self.originUrl = originUrl
        self.method = method
        self.totalNodes = int(totalNodes)
        self.nodeCount = 0
        self.depthLimit = int(depthLimit)
        self.currentDepth = 0
        self.keyword = keyword
        self.keywordUrls = []
        self.nodeUrlMap = {}
        self.nodesToVisit = []
        self.visitedUrls = set()
        self.graph = Graph()
        self.nodeIndex = 0
        self.storeCookie()
        originTitle = self.getTitle(originUrl)
        startNode = Node(originUrl, None, originTitle)
        self.crawl(startNode)
    def crawl(self, node):
        print("crawl(): " + strftime("%H:%M:%S", gmtime()))
        visited = node.url in self.visitedUrls
        if not visited:
            self.graph.addNode(node, self.nodeIndex)
            self.nodeIndex += 1
            self.nodeCount += 1
            self.visitedUrls.add(node.url)
        if node.sourceNodes:  # If this is not the starting node
            sourceNode = node.sourceNodes.pop()
            if sourceNode.index is not None and node.index is not None:
                self.graph.addEdge(sourceNode.index, node.index)  # Add an edge between sourceNode and node
        if not visited:
            soup = self.generateSoup(node.url)
            hasKeyword = self.checkForKeyword(soup, node.url)
            if hasKeyword:
                node.keyword = True
            links = self.findLinks(soup)
            links = self.validLinks(links)
            links = {l for l in links}  # Remove duplicate links
            if links:
                if self.method == "BFS":
                    self.bfs(node, links)
                else:  # DFS
                    self.currentDepth += 1
                    if self.currentDepth >= self.depthLimit:  # If depth limit reached, getNextNode (up a level)
                        self.currentDepth = 0  # Reset currentDepth
                        self.getNextNode()
                    else:  # Otherwise, keep going deeper
                        self.dfs(node, links)
            else:  # No links present
                self.getNextNode()
        else:  # Avoid infinite loop
            self.getNextNode()
    def validLinks(self, links):
        print("validLinks(): " + strftime("%H:%M:%S", gmtime()))
        validLinks = []
        for link in links:
            # Only add links while there is still room
            if self.nodeCount + len(validLinks) <= self.totalNodes:
                if self.isValidUrl(link):
                    validLinks.append(link)
        return validLinks
    def isValidUrl(self, url):
        print("isValidUrl(): " + strftime("%H:%M:%S", gmtime()))
        extensionBlacklist = ["zip", "dmg", "msi", "tar", "exe", "sisx"]
        for x in extensionBlacklist:
            if x in url:
                return False
        if "http" not in url: return False
        parsed_url = urlparse(url)
        if not bool(parsed_url.scheme): return False
        try:
            self.testRequest(url)
        except:
            return False
        return True

    @timeout(3)
    def testRequest(self, url):
        requests.get(url)
    def getNextNode(self):
        print("getNextNode(): " + strftime("%H:%M:%S", gmtime()))
        if len(self.nodesToVisit) != 0 and not self.nodeLimitReached():
            # We use the same data structure to store nodesToVisit for BFS and DFS,
            # and pop elements off the same way. How the elements are added is
            # what's important (see the illustration after this method).
            nextNode = self.nodesToVisit.pop()
            self.crawl(nextNode)
        else:  # Crawl is over
            self.printGraph()
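    # Illustration (my own, not from the original code) of why one list works
    # for both strategies: bfs() inserts new nodes at index 0 and getNextNode()
    # pops from the end, giving first-in-first-out order, while dfs() appends
    # new nodes and pops from the end, giving last-in-first-out order.
    #
    #     q = []
    #     q.insert(0, 'a'); q.insert(0, 'b')   # BFS enqueue -> ['b', 'a']
    #     q.pop()                              # -> 'a' (oldest first)
    #
    #     s = []
    #     s.append('a'); s.append('b')         # DFS push -> ['a', 'b']
    #     s.pop()                              # -> 'b' (newest first)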
    def printGraph(self):
        for node in self.graph.nodes:
            print(" Node:")
            if node.title:
                print("Index: " + str(node.index))
                print("Title: " + node.title)
                print("URL: " + node.url)
                print("Keyword: " + str(node.keyword))
        if self.graph.edges:
            print(" Edges:")
            for e in self.graph.edges:
                print("Source: " + str(e.source) + " Target: " + str(e.target))
        if self.keywordUrls:
            print(" Keyword URLs:")
            for k in self.keywordUrls:
                print("URL: " + k)
        print(" JSON:")
        print(self.jsonSerialize())
    def jsonSerialize(self):
        for n in self.graph.nodes:
            n.sourceNodes = []
        self.graph.edges = list(self.graph.edges)
        return json.dumps(self.graph, default=lambda o: o.__dict__)

    def storeCookie(self):
        # Store graph as cookie (do this one)
        pass

    def nodeLimitReached(self):
        return self.nodeCount >= self.totalNodes
    # Convert URL into soup
    def generateSoup(self, url):
        print("generateSoup(): " + strftime("%H:%M:%S", gmtime()))
        sourceCode = requests.get(url)
        plainText = sourceCode.text
        soup = BeautifulSoup(plainText, "html.parser")
        return soup

    # Parse soup to find links
    def findLinks(self, soup):
        print("findLinks(): " + strftime("%H:%M:%S", gmtime()))
        links = soup.findAll('a')
        hrefs = []
        for link in links:
            href = link.get('href', '')
            hrefs.append(href)
        return hrefs
    def getTitle(self, url):
        print("getTitle(): " + strftime("%H:%M:%S", gmtime()))
        soup = self.generateSoup(url)
        titles = soup.findAll('title')
        if titles:
            # Strip newlines and the <title></title> tags from the title string
            title = str(titles[0]).replace("\n", "")
            title = title.replace("<title>", "").replace("</title>", "")
            return title
    def bfs(self, currentNode, links):
        print("bfs(): " + strftime("%H:%M:%S", gmtime()))
        for link in links:
            # If url is not already visited, and nodesToVisit+nodeCount hasn't exceeded totalNodes
            if link not in self.visitedUrls and self.nodeCount + len(self.nodesToVisit) <= self.totalNodes:
                title = self.getTitle(link)
                newNode = Node(link, [currentNode], title)
                newNode.sourceNodes.insert(0, currentNode)
                self.nodesToVisit.insert(0, newNode)
                self.nodeUrlMap[link] = newNode
            elif link in self.nodeUrlMap:  # Repeat URL, get existing node
                existingNode = self.nodeUrlMap[link]
                existingNode.sourceNodes.insert(0, currentNode)
                self.nodesToVisit.insert(0, existingNode)
        self.getNextNode()
    def dfs(self, currentNode, links):
        print("dfs(): " + strftime("%H:%M:%S", gmtime()))
        for link in links:
            if link not in self.visitedUrls:
                title = self.getTitle(link)
                newNode = Node(link, [currentNode], title)
                newNode.sourceNodes.append(currentNode)
                self.nodesToVisit.append(newNode)
            elif link in self.nodeUrlMap:  # Repeat URL, get existing node
                existingNode = self.nodeUrlMap[link]
                existingNode.sourceNodes.append(currentNode)
                self.nodesToVisit.append(existingNode)
        self.getNextNode()
    def checkForKeyword(self, soup, url):
        # If keyword found in soup, append url to keywordUrls
        if soup.body and soup.body.findAll(text=self.keyword):
            self.keywordUrls.append(url)
            return True


if __name__ == '__main__':
    webCrawler = WebCrawler(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5])
py.py
"""
implement a simple web crawler
Usage: crawler.py seed_url
seed: absolute url - the crawler will use it as the initial web address
"""
import urllib.request
import urllib.parse
import urllib.error
import urllib.robotparser
import re
import sys
# DO NOT CHANGE ok_to_crawl!!!
def ok_to_crawl(absolute_url):
    """
    check if it is OK to crawl the specified absolute url

    We are implementing polite crawling by checking the robots.txt file
    for all urls except the ones using the file scheme (these are urls
    on the local host and they are all OK to crawl.)
    We also use this function to skip over mailto: links and javascript: links.
    Parameter:
    absolute_url (string): this is an absolute url that we would like to crawl
    Returns:
    boolean: True if the scheme is file (it is a local webpage)
             True if we successfully read the corresponding robots.txt
             file and determined that user-agent * is allowed to crawl
             False if it is a mailto: link or a javascript: link,
             if user-agent * is not allowed to crawl it, or
             if it is NOT an absolute url.
    """
    if absolute_url.lower().startswith('mailto:'):
        return False
    if absolute_url.lower().startswith('javascript:'):
        return False
    link_obj = urllib.parse.urlparse(absolute_url)
    if link_obj.scheme.lower().startswith('file'):
        return True
    # check if the url given as input is an absolute url
    if not link_obj.scheme or not link_obj.hostname:
        print('Not a valid absolute url: ', absolute_url)
        return False
    # construct the robots.txt url from the scheme and host name
    else:
        robot_url = link_obj.scheme + '://' + link_obj.hostname + '/robots.txt'
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(robot_url)
        try:
            rp.read()
        except:
            print("Error accessing robot file: ", robot_url)
            return False
        else:
            return rp.can_fetch("*", absolute_url)
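# Sketch (not part of the assignment) for the "use a text file of my choice in
# place of robots.txt" question: RobotFileParser.parse() accepts the lines of a
# robots.txt-style file, so the rules can be loaded from a local file (the name
# rules.txt below is just an example) instead of fetching scheme://host/robots.txt.
def ok_to_crawl_from_file(absolute_url, rules_path='rules.txt'):
    rp = urllib.robotparser.RobotFileParser()
    try:
        with open(rules_path, encoding='utf-8') as rules_file:
            rp.parse(rules_file.read().splitlines())
    except OSError as err:
        print("Error reading rules file: ", rules_path, err)
        return False
    return rp.can_fetch("*", absolute_url)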
# DO NOT CHANGE crawl!!!
def crawl(seed_url):
    """
    start with the seed_url and crawl up to 10 urls

    Parameter:
    seed_url (string) - this is the first url we'll visit.
    Returns:
    set of strings - set of all the urls we have visited.
    """
    urls_tocrawl = {seed_url}  # initialize our set of urls to crawl
    urls_visited = set()  # initialize our set of urls visited
    while urls_tocrawl and len(urls_visited) < 10:
        current_url = urls_tocrawl.pop()  # just get any url from the set
        if current_url not in urls_visited:  # check if we have crawled it before
            page = get_page(current_url)
            if page:
                more_urls = extract_links(current_url, page)  # get the links
                urls_tocrawl = urls_tocrawl | more_urls  # add them to be crawled
            urls_visited.add(current_url)
    return urls_visited
#------------Do not change anything above this line-----------------------------
def get_page(url):
    """
    generate a web page of html in string from url

    params: absolute url as string
    return: if there is URLError or DecodeError, return an empty string
            else return the full html page content as string
    """
    try:
        with urllib.request.urlopen(url) as url_file:
            page_string = url_file.read().decode('UTF-8')
            return page_string
    except urllib.error.URLError as url_err:
        print("Error opening url: ", url, url_err)
        return ""
    except UnicodeDecodeError as decode_err:
        print("Error decoding url", url, decode_err)
        return ""
def extract_links(base_url, page):
    """
    extract the links contained in the page at the base_url

    Parameters:
    base_url (string): the url we are currently crawling - web address
    page(string): the content of that url - html
    Returns:
    A set of absolute urls (set of strings) - These are all the urls extracted
    from the current url and converted to absolute urls.
    """
    urls_set = set()
    # Match the href="..." values in the page. The original pattern was lost
    # when the code was pasted, so this is an assumed equivalent.
    page_links = re.findall(r'href\s*=\s*["\'](.*?)["\']', page)
    for link in page_links:
        # Convert each link to an absolute url
        absolute_url = urllib.parse.urljoin(base_url, link)
        urls_set.add(absolute_url)
    return urls_set
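# Quick illustration (my own example URLs, not from the assignment) of how
# urljoin resolves relative links against the page being crawled:
#     urllib.parse.urljoin('http://example.com/a/b.html', 'c.html')
#         -> 'http://example.com/a/c.html'
#     urllib.parse.urljoin('http://example.com/a/b.html', '/d.html')
#         -> 'http://example.com/d.html'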
def main():
    # Expect exactly one command line argument, the seed url
    # (assumed reconstruction; the pasted version of main was incomplete).
    if len(sys.argv) != 2:
        print('Usage: python py.py seed_url')
        return
    seed_url = sys.argv[1]
    urls_visited = crawl(seed_url)
    # Write each visited url to crawled.txt, one per line
    with open('crawled.txt', 'w', encoding='utf-8') as new_file:
        for url in urls_visited:
            new_file.write(url + "\n")


if __name__ == '__main__':
    main()