Question: Help with Python code: I am doing my paper for C996 programming in python. I was able to write the code (don't know if it

Help with Python code:

I am doing my paper for C996 programming in python. I was able to write the code (don't know if it is correct). I have to extracts links that point to other HTML pages and include them in cvs file. All links should be in absolute format.

Below are two different codes and none of them gives me correct answer (it should be 118 links).

Can you please advise me in what i need to add/correct

CODE 1:

from bs4 import BeautifulSoup

import urllib.request

import urllib.parse

from urllib.parse import urljoin

import requests

import csv

#Scrape website, specify the url, send the requests and catch the response

source='https://www.census.gov/programs-surveys/popest.html'

url_results=requests.get (source)

#Parse the HTML using BeautifulSoup to get the content from the website

soup=BeautifulSoup(url_results.content,'html.parser')

print (soup.prettify())

#Number of retreived URLs

all_links=soup.find_all('a')

for link in soup.find_all ('a', href=True):

href=link.get('href')

urls = []

for link in soup.find_all('a', href=True):

urls.append(link['href'])

#deduplicate links

urls_list=[]

for link in urls:

if link not in urls_list:

urls_list.append(link)

#printing deduplicated links

for link in urls_list:

print(link)

#Create a file to write to

with open("outputsep0903.csv", "w",newline="") as f:

cw = csv.writer(f,delimiter=" ")

cw.writerow(urls_list)

f.close()

CODE 2:

from bs4 import BeautifulSoup

import urllib.request

import urllib.parse

from urllib.parse import urljoin

import requests

import csv

#Scrape website, specify the url, send the requests and catch the response

source_url='https://www.census.gov/programs-surveys/popest.html'

url_results=requests.get (source_url)

#Parse the HTML using BeautifulSoup

soup=BeautifulSoup(url_results.content,'html.parser')

print (soup.prettify())

#Counting links, loop created to print out all tags, all links to html pages

all_links=soup.find_all('a')

for link in all_links:

link.get('href')

print ("All links:",len(all_links))

#Unique links, absolute path, deduplication

def unique_links (all_links, source_url):

output_links=set()

for link in all_links:

link=link.get ('href')

if link is None:

continue

if link.endswith ('/') or link.endswith('#'):

link=link[-1]

actual_url=urllib.parse.urljoin(source_url,link)

output_links.add (actual_url)

return output_links

output_links=unique_links(all_links,source_url)

print ('Number of not duplicated links:',len(output_links))

#Create a file to write to

with open("outputsep3.csv", "w",newline="") as f:

cw = csv.writer(f,delimiter=" ")

cw.writerow(output_links)

f.close()

Step by Step Solution

There are 3 Steps involved in it

1 Expert Approved Answer
Step: 1 Unlock blur-text-image
Question Has Been Solved by an Expert!

Get step-by-step solutions from verified subject matter experts

Step: 2 Unlock
Step: 3 Unlock

Students Have Also Explored These Related Programming Questions!