Script to scrape all the links from each page listed in an input CSV and store them in a CSV file.
Uses httplib2, BeautifulSoup, pandas and urllib.parse.
Code:
import httplib2
from bs4 import BeautifulSoup, SoupStrainer
import pandas as pd
from urllib.parse import urlparse, urljoin

inputFile = 'archivelist.csv'
http = httplib2.Http()

# Read the list of pages to scrape; use the 'archive' column if it exists,
# otherwise fall back to the first column.
csvInputDf = pd.read_csv(inputFile)
if 'archive' in csvInputDf.columns:
    index_column = 'archive'
else:
    index_column = csvInputDf.columns[0]
archives = csvInputDf[index_column]

links = []
for url in archives:
    print('url = ' + url)
    # httplib2 returns (response headers, body); the body is what BeautifulSoup parses.
    response, content = http.request(url)

    # Base domain of the current page, used to resolve relative links.
    parsedUrl = urlparse(url)
    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsedUrl)

    # Parse only the <a> tags.
    for item in BeautifulSoup(content, 'html.parser', parse_only=SoupStrainer('a')):
        if item.has_attr('href'):
            linkParse = urlparse(item['href'])
            if linkParse.netloc == '':
                # Relative link: join it to the page's domain.
                link = urljoin(domain, item['href'])
            else:
                link = item['href']
            links.append(link)

# Write all collected links to a single CSV file after the loop,
# so links from earlier pages are not overwritten.
my_df = pd.DataFrame(links)
my_df.to_csv('LinkList.csv', index=False, header=False)
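Example input (the column name 'archive' and the example.com URLs are only placeholders; any CSV whose first column holds URLs also works because of the fallback above):

archivelist.csv:
archive
https://example.com/
https://example.com/blog/

Running the script then writes LinkList.csv with one absolute URL per line, relative links having been resolved against each page's domain.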