-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
84 lines (60 loc) · 2.47 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
"""News scraper entry module: fetches article pages from configured news
sites and saves them to dated CSV files."""

# Standard library.
import argparse
import csv
import datetime
import logging
import re

# Third-party.
from requests.exceptions import HTTPError
from urllib3.exceptions import MaxRetryError

# Project-local.
import news_page_objects as news
from common import config

# Configure logging once, after all imports (previously basicConfig was
# interleaved between import statements).
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Absolute http(s) URL with at least one path segment, e.g. https://ejemplitos.com/img
is_well_formed_link = re.compile(r'^https?://.+/.+$')
# Root-relative path, e.g. /some-text
is_root_path = re.compile(r'^/.+$')
def _news_scraper(news_site_uid):
    """Scrape all articles linked from a news site's home page and save them.

    Args:
        news_site_uid: key into config()['news_sites'] identifying the site.

    Side effects:
        Writes a dated CSV file via _save_articles.
    """
    host = config()['news_sites'][news_site_uid]['url']
    # Use the module logger (not the root logger) with lazy %-formatting.
    logger.info('Beginning scraper for %s', host)

    homepage = news.HomePage(news_site_uid, host)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)
        if article:
            logger.info('Article fetched')
            articles.append(article)
            # NOTE(review): the original had a debug `print` and a `break`
            # here that stopped after the first article; removed so the
            # whole home page is scraped.

    _save_articles(news_site_uid, articles)
def _save_articles(news_site_uid, articles):
now = datetime.datetime.now().strftime('%Y_%m_%d')
out_file_name = '{news_site_uid}_{datetime}_articles.csv'.format(news_site_uid=news_site_uid, datetime=now)
csv_headers = list(filter(lambda property: not property.startswith('_'), dir(articles[0])))
with open(out_file_name, mode='w+') as f:
writer = csv.writer(f)
writer.writerow(csv_headers)
for article in articles:
row = [str(getattr(article, prop))for prop in csv_headers]
writer.writerow(row)
def _fetch_article(news_site_uid, host, link):
    """Download one article page; return it, or None when it is unusable.

    Args:
        news_site_uid: site identifier passed through to ArticlePage.
        host: base URL of the news site.
        link: article link as found on the home page (absolute or relative).

    Returns:
        A news.ArticlePage with a non-empty body, or None on fetch error
        or when the fetched page has no body.
    """
    logger.info('Start fetching article at {}'.format(link))

    full_url = _build_link(host, link)
    try:
        article = news.ArticlePage(news_site_uid, full_url)
    except (HTTPError, MaxRetryError):
        # Network/HTTP failures are expected occasionally; log and skip.
        logger.warning('Error while fetching the article', exc_info=False)
        return None

    # A page without a body is not a real article; discard it.
    if not article.body:
        logger.warning('Invalid article. There is no body')
        return None

    return article
def _build_link(host, link):
    """Turn a scraped link into an absolute URL on the given host.

    Args:
        host: base URL of the site (no trailing slash expected).
        link: absolute URL, root-relative path, or bare relative fragment.

    Returns:
        An absolute URL string.
    """
    # Already a well-formed absolute URL: pass it through untouched.
    if is_well_formed_link.match(link):
        return link
    # Root-relative path (starts with '/'): append directly to the host.
    if is_root_path.match(link):
        return '{}{}'.format(host, link)
    # Bare fragment: join to the host with an explicit slash.
    return '{host}/{uri}'.format(host=host, uri=link)
if __name__ == '__main__':
    # CLI entry point: one positional argument selecting which configured
    # news site to scrape; choices come from the project config file.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'news_site',
        help='The news site you want to scrape',
        type=str,
        choices=list(config()['news_sites'].keys()),
    )
    parsed = parser.parse_args()
    _news_scraper(parsed.news_site)