scrape-politifact.py
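# This script scrapes PolitiFact fact-check listings for a given speaker and
# collects the author, statement, source, date, and rating of each fact-check,
# then prints the results and saves the first five rows to a CSV file.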
# Import the dependencies
from bs4 import BeautifulSoup
import pandas as pd
import requests
import urllib.request
import time

# Create lists to store the scraped data
authors = []
dates = []
statements = []
sources = []
targets = []

# Create a function to scrape the site
def scrape_website(page_number, source):
    page_num = str(page_number)  # Convert the page number to a string
    # source: a certain speaker only
    URL = 'https://www.politifact.com/factchecks/list/?page={}&speaker={}'.format(page_num, source)
    # source: all
    # URL = 'https://www.politifact.com/factchecks/list/?page=' + page_num  # Append the page number to complete the URL
    webpage = requests.get(URL)  # Make a request to the website
    # time.sleep(3)
    soup = BeautifulSoup(webpage.text, "html.parser")  # Parse the text from the website
    # Get the tags and their classes
    statement_footer = soup.find_all('footer', attrs={'class': 'm-statement__footer'})  # Author and date
    statement_quote = soup.find_all('div', attrs={'class': 'm-statement__quote'})  # Statement text
    statement_meta = soup.find_all('div', attrs={'class': 'm-statement__meta'})  # Source
    target = soup.find_all('div', attrs={'class': 'm-statement__meter'})  # Rating
    # Loop through the footer class m-statement__footer to get the date and author;
    # the footer text is expected to look like "By <first> <last> • <Month> <day>, <year>"
    for i in statement_footer:
        link1 = i.text.strip()
        name_and_date = link1.split()
        first_name = name_and_date[1]
        last_name = name_and_date[2]
        full_name = first_name + ' ' + last_name
        month = name_and_date[4]
        day = name_and_date[5]
        year = name_and_date[6]
        date = month + ' ' + day + ' ' + year
        dates.append(date)
        authors.append(full_name)
    # Loop through the div m-statement__quote to get the statement text
    for i in statement_quote:
        link2 = i.find_all('a')
        statements.append(link2[0].text.strip())
    # Loop through the div m-statement__meta to get the source
    for i in statement_meta:
        link3 = i.find_all('a')  # Source
        source_text = link3[0].text.strip()
        sources.append(source_text)
    # Loop through the div m-statement__meter to get the rating of the statement (True, False, etc.)
    for i in target:
        fact = i.find('div', attrs={'class': 'c-image'}).find('img').get('alt')
        targets.append(fact)

# Loop through 'n-1' webpages to scrape the data
n = 2
for i in range(1, n):
    scrape_website(i, source='joe-biden')

# Create a new DataFrame from the scraped lists
data = pd.DataFrame(columns=['author', 'statement', 'source', 'date', 'target'])
data['author'] = authors
data['statement'] = statements
data['source'] = sources
data['date'] = dates
data['target'] = targets

# Show the data set and save the first five rows to a CSV sample
print(data)
data.iloc[:5].to_csv('politifact-joe-biden-sample.csv', index=False, sep=',')