# -*- coding: utf-8 -*-
import json

import requests
import validators
from bs4 import BeautifulSoup

data_file_path = "./static/scraped_data.json"
'''
FUNCTION INPUT:
The function below works for the Tate gallery website (debugged & tested).
To extract gallery info from Tate, first define a dictionary mapping the gallery name to its "What's On" URL:
    gallery_2_url = {'Tate': 'https://www.tate.org.uk/whats-on'}
and pass it in when calling extract_gallery_info(gallery_2_url).
'''
'''
FUNCTION OUTPUTS:
The function returns six lists, covering all exhibitions'...
1. titles
2. dates
3. locations
4. brief intros
5. articles
6. URL of EACH exhibition's own page
NOTE: any missing info is stored as None in the lists;
thus, in all six lists, the same index number points to the SAME exhibition!
(A usage sketch is given at the bottom of this file.)
'''
def load_data_from_file(data_file_path):
try:
with open(data_file_path, 'r') as file:
data = json.load(file)
return data
except FileNotFoundError:
return None
def save_data_to_file(data, data_file_path):
with open(data_file_path, 'w') as file:
json.dump(data, file)
def extract_gallery_info(gallery_2_url):
    # Check whether previously scraped data already exists on disk
    saved_data = load_data_from_file(data_file_path)
    if saved_data:
        # return the values of the cached dict; they were saved in the same order
        # as the tuple returned below (titles, dates, locations, intro, articles, urls)
        return saved_data.values()
    # Data not found in the cache file, proceed with scraping
# initialize 6 types of info from each exhibition
exhib_titles = [] # 1. titles
exhib_dates = [] # 2. dates
exhib_locations = [] # 3. locations
exhib_intro = [] # 4. brief intro
exhib_articles = [] # 5. article
exhib_urls = [] # 6. url for EACH exhibition's page
    # loop through all exhibition overview pages (max 20 per page) -- OUTER LOOP
    # and loop through all exhibitions listed on each of these pages -- INNER LOOP
    # initialize current_url as the first overview page's URL
    current_url = gallery_2_url.get('Tate')
    # site root ("https://www.tate.org.uk", i.e. the first 23 characters),
    # concatenated with relative hrefs to form valid absolute URLs
    home_url = current_url[:23]
# OUTER LOOP
    while current_url:   # keep looping until the last overview page has been processed
        # fetch and parse the current overview page
        page = requests.get(current_url)
        soup = BeautifulSoup(page.text, 'html.parser')
        # get all exhibition info on the current overview page
        mixed_info = soup.find_all('div', class_='card')
        # reset the current page's hrefs (links to each exhibition's own page)
        current_page_hrefs = []
        # 4. brief intros for each exhibition
        for i in mixed_info:
            info_chunk = i.find_all('div', class_="card__description")
            if not info_chunk:   # exhibition without intro info
                exhib_intro.append(None)
            else:
                for intro in info_chunk:
                    intro = intro.text.strip().replace('\xa0', ' ')
                    exhib_intro.append(intro)
        # 6. collect the URL of each exhibition's own page
        for i in mixed_info:
            info_chunk = i.find_all('a')
            for j in info_chunk:
                href = j.get('href')
                if not href:   # anchor without an href -> nothing to follow
                    continue
                # CHECK THAT THE URL IS WELL-FORMED HERE
                if validators.url(href):
                    new_url = href
                elif validators.url(home_url + href):
                    new_url = home_url + href
                else:
                    continue   # malformed link: skip it
                # prevent URL repetitions
                if new_url not in current_page_hrefs:
                    current_page_hrefs.append(new_url)   # update current page hrefs
                    exhib_urls.append(new_url)           # accumulate all URLs
        # INNER LOOP
        # extract info from each exhibition's own separate web page
for href in current_page_hrefs:
page2 = requests.get(href)
soup2 = BeautifulSoup(page2.text, 'html.parser')
# 1. title
title = soup2.find('h1', class_ = "splash-header__fulltitle")
if title: # content not empty
title = title.text.strip()
exhib_titles.append(title)
# 2. date
date = soup2.find('span',class_ = "splash-header__dates")
if date: # content not empty
date = date.get_text(strip = True,separator = "; ")
else:
                date = soup2.find('div', class_="content__info-dates")   # on some pages the date is displayed in a different format
if date: # content not empty
date = date.get_text(strip = True,separator = "; ")
exhib_dates.append(date)
            # 3. location
            location = soup2.find('span', class_="splash-header__venue")
            if location:   # content not empty
                location = location.text.strip()
            else:
                # on some pages the location is displayed in a different format
                location = soup2.find('div', class_="content__info-location")
                if location:   # content not empty
                    if location.find('p'):
                        location = location.find('p').get_text(strip=True, separator=" - ")
                        location = location.split(" - ")[0]
                    else:
                        location = location.get_text(strip=True)
            exhib_locations.append(location)
# 5. article
article = soup2.find('div', class_ = "block-rich_text")
if article: # content not empty
article = article.text.strip()
exhib_articles.append(article)
        # get the URL of the next overview page (if any)
        next_page = soup.find_all('li', class_='pager__item pager__item--next')
        # check whether the last page has been reached & update the current page URL accordingly
        # CHECK THAT THE URL IS WELL-FORMED
        if next_page:
            next_link = next_page[0].find('a')
            next_href = next_link.get('href', '') if next_link else ''
            if validators.url(next_href):
                current_url = next_href
            elif next_href and validators.url(home_url + next_href):
                current_url = home_url + next_href
            else:
                current_url = ''   # missing or malformed link: stop instead of looping forever
        else:
            current_url = ''
# After scraping, save the data to the file
data = {
'titles': exhib_titles,
'dates': exhib_dates,
'locations': exhib_locations,
'intro': exhib_intro,
'articles': exhib_articles,
'urls': exhib_urls
}
save_data_to_file(data, data_file_path)
return exhib_titles, exhib_dates, exhib_locations, exhib_intro, exhib_articles, exhib_urls
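
# Usage sketch (not part of the original scraper logic): one plausible way to
# drive extract_gallery_info from the command line, assuming the Tate
# "What's On" page is reachable and the ./static/ directory exists so the
# JSON cache can be written. On a second run the six lists should be loaded
# straight from ./static/scraped_data.json instead of being re-scraped.
if __name__ == "__main__":
    gallery_2_url = {'Tate': 'https://www.tate.org.uk/whats-on'}
    titles, dates, locations, intros, articles, urls = extract_gallery_info(gallery_2_url)
    # the six lists are index-aligned: item i in each list describes the same exhibition
    for i in range(min(3, len(titles))):
        print(titles[i], '|', dates[i], '|', locations[i])
        print(urls[i])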