Help with Python code:
I am doing my paper for C996 Programming in Python. I was able to write the code (I don't know if it is correct). I have to extract links that point to other HTML pages and include them in a CSV file. All links should be in absolute format.
Below are two different versions of the code, and neither of them gives me the correct answer (it should be 118 links).
Can you please advise me on what I need to add/correct?
CODE 1:
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
from urllib.parse import urljoin
import requests
import csv

# Scrape the website: specify the URL, send the request, and catch the response
source = 'https://www.census.gov/programs-surveys/popest.html'
url_results = requests.get(source)

# Parse the HTML using BeautifulSoup to get the content from the website
soup = BeautifulSoup(url_results.content, 'html.parser')
print(soup.prettify())

# Number of retrieved URLs
all_links = soup.find_all('a')
for link in soup.find_all('a', href=True):
    href = link.get('href')

urls = []
for link in soup.find_all('a', href=True):
    urls.append(link['href'])

# Deduplicate links
urls_list = []
for link in urls:
    if link not in urls_list:
        urls_list.append(link)

# Print the deduplicated links
for link in urls_list:
    print(link)

# Create a file to write to
with open("outputsep0903.csv", "w", newline="") as f:
    cw = csv.writer(f, delimiter=" ")
    cw.writerow(urls_list)
f.close()
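(Note on CODE 1: it collects raw hrefs but never converts relative paths to absolute form with urljoin, and it never filters for links that actually point to HTML pages, so its deduplicated count will not match 118. A minimal sketch of the missing step, continuing from the source and urls_list variables above; the .html filename filter is my assumption about what the assignment counts, not something stated in the post:

from urllib.parse import urljoin

# Convert each href to absolute form and keep only links to HTML pages
# (the .html filter is an assumption about which links the assignment wants)
absolute_html_links = []
for href in urls_list:
    absolute = urljoin(source, href)   # resolve relative paths against the page URL
    if absolute.endswith('.html') and absolute not in absolute_html_links:
        absolute_html_links.append(absolute)

)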
CODE 2:
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
from urllib.parse import urljoin
import requests
import csv

# Scrape the website: specify the URL, send the request, and catch the response
source_url = 'https://www.census.gov/programs-surveys/popest.html'
url_results = requests.get(source_url)

# Parse the HTML using BeautifulSoup
soup = BeautifulSoup(url_results.content, 'html.parser')
print(soup.prettify())

# Count the links: loop over all <a> tags and read each href
all_links = soup.find_all('a')
for link in all_links:
    link.get('href')
print("All links:", len(all_links))

# Unique links: absolute path, deduplication
def unique_links(all_links, source_url):
    output_links = set()
    for link in all_links:
        link = link.get('href')
        if link is None:
            continue
        if link.endswith('/') or link.endswith('#'):
            link = link[-1]
        actual_url = urllib.parse.urljoin(source_url, link)
        output_links.add(actual_url)
    return output_links

output_links = unique_links(all_links, source_url)
print('Number of not duplicated links:', len(output_links))

# Create a file to write to
with open("outputsep3.csv", "w", newline="") as f:
    cw = csv.writer(f, delimiter=" ")
    cw.writerow(output_links)
f.close()
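(Note on CODE 2: the slicing is the main bug. link[-1] keeps only the last character of the href, so every URL ending in '/' or '#' collapses to a one-character string; link[:-1] is what drops the trailing character you are checking for. Also, cw.writerow(output_links) writes every URL into a single space-delimited row, whereas one URL per row is the usual CSV layout, and nothing filters for links that point to HTML pages. Below is a corrected sketch combining both attempts; the .html filter is my assumption about which links "point to other HTML pages", and I can't verify it yields exactly 118 for the live page:

from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests
import csv

source_url = 'https://www.census.gov/programs-surveys/popest.html'
soup = BeautifulSoup(requests.get(source_url).content, 'html.parser')

output_links = set()
for tag in soup.find_all('a', href=True):
    href = tag['href'].strip()
    # Drop a trailing '/' or '#' so near-duplicates collapse to one entry
    if href.endswith('/') or href.endswith('#'):
        href = href[:-1]          # note: [:-1], not [-1]
    absolute = urljoin(source_url, href)   # make the link absolute
    # Keep only links that point to other HTML pages
    # (assumption: the assignment means URLs ending in .html)
    if absolute.endswith('.html'):
        output_links.add(absolute)

print('Number of deduplicated links:', len(output_links))

# One URL per row is the usual CSV layout
with open("output_links.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows([link] for link in sorted(output_links))

)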
