forked from Bazifrasool/RedditImageScraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathredditdownloader.py
153 lines (114 loc) · 3.96 KB
/
redditdownloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# Generated by Selenium IDE
import pytest
import time
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import requests
import re
import os
import shutil
from concurrent.futures import ThreadPoolExecutor
import threading
class RedditImageDownloader():
    """Collect image links from a subreddit page with Selenium and download them.

    Intended call order is ``setup_method`` -> ``download`` ->
    ``teardown_method``. ``setup_method`` changes the process working
    directory to the output folder; ``download`` writes files relative to
    that directory and moves one level up (``./..``) when it is done.
    """

    # Selector clicked once after page load; a missing element is reported
    # as "no prompt" and otherwise ignored.
    _PROMPT_SELECTOR = ".i2sTp1duDdXdwoKi1l8ED"
    # Class name of anchor elements whose ``href`` is collected as-is.
    _EXTERNAL_LINK_CLASS = "_13svhQIUZqD9PVzFcLwOKT"
    # Class name of image elements whose ``src`` is rewritten by ``cleaner``.
    _IMAGE_CLASS = "_2_tDEnGMLxpM6uOa2kaDB3"
    # Captures the file name that follows the first "t/" in a URL and ends in
    # ".<two non-space chars>g" (e.g. ".jpg", ".png"). Same pattern as the
    # original, written as a raw string so the escapes are valid.
    _PIC_NAME_PATTERN = re.compile(r"t/(\S*\.\S\Sg).*")

    def setup_method(self, subreddit="wallpapers/", path="./"):
        """Start Chrome and prepare an empty output directory.

        Args:
            subreddit: Subreddit path appended to ``https://www.reddit.com/r/``
                and to ``path`` to form the output directory.
            path: Prefix concatenated (not joined) with ``subreddit``.

        Raises:
            OSError: If the output directory cannot be created or entered.
                The browser is shut down before the error propagates.
        """
        chrome_options = ChromeOptions()
        chrome_options.add_argument("--disable-notifications")
        self.driver = webdriver.Chrome(options=chrome_options)
        self.vars = {}
        self.subreddit = subreddit

        target_dir = path + self.subreddit
        try:
            # Create the directory; if it already exists, wipe it and start
            # fresh. Only "already exists" triggers the wipe, so unrelated
            # failures (permissions, missing parent) surface unchanged.
            try:
                os.mkdir(target_dir)
            except FileExistsError:
                shutil.rmtree(target_dir)
                os.mkdir(target_dir)
            os.chdir(target_dir)
        except Exception:
            # Do not leave a browser process behind on a failed setup.
            self.driver.quit()
            self.driver = None
            raise

    def teardown_method(self):
        """Quit the browser if one is running; safe to call more than once."""
        driver = getattr(self, "driver", None)
        if driver is not None:
            driver.quit()
            self.driver = None

    @staticmethod
    def _image_extension(link):
        """Return ".jpg" or ".png" depending on the link text, else ``None``."""
        if not link:
            return None
        if "jpg" in link:
            return ".jpg"
        if "png" in link:
            return ".png"
        return None

    def image_getter(self, link, i):
        """Download ``link`` into the current directory.

        The file is named ``<subreddit><i><extension>``. Links that contain
        neither "jpg" nor "png", and requests that fail, are reported with
        ``print`` and skipped.

        Args:
            link: URL of the image.
            i: Index used to make the file name unique.
        """
        extension = self._image_extension(link)
        if extension is None:
            print("skipping unsupported link: " + str(link))
            return

        try:
            response = requests.get(link, timeout=30)
            response.raise_for_status()
        except requests.RequestException as error:
            print("download failed for " + str(link) + ": " + repr(error))
            return

        # The subreddit usually carries a trailing "/" (see the default); a
        # separator inside the file name would point at a directory that
        # does not exist, so it is removed from the prefix.
        prefix = self.subreddit.strip("/").replace("/", "_")
        with open(prefix + str(i) + extension, "wb") as file:
            file.write(response.content)

    def cleaner(self, imm_list):
        """Rewrite image URLs to ``https://i.redd.it/<file name>``.

        Args:
            imm_list: Iterable of URL strings (``None`` entries are allowed).

        Returns:
            A list with one rewritten URL per entry that matched the file
            name pattern; non-matching and empty entries are skipped.
        """
        link_processed = []
        for each in imm_list:
            match = self._PIC_NAME_PATTERN.search(each) if each else None
            if match is None:
                continue
            link_processed.append("https://i.redd.it/" + match.group(1))
        return link_processed

    def _scroll_to_bottom(self, attempts=4):
        """Scroll until the scroll position stops changing.

        After a scroll that does not move the page, up to ``attempts``
        further tries are made (each preceded by an extra one-second wait)
        before giving up.
        """
        time.sleep(2)
        stalled = 0
        while True:
            before = self.driver.execute_script(
                "return document.documentElement.scrollTop;")
            self.driver.execute_script(
                "window.scrollTo(0,document.body.scrollHeight);")
            after = self.driver.execute_script(
                "return document.documentElement.scrollTop;")
            time.sleep(1)
            if before != after:
                stalled = 0
                continue
            if stalled >= attempts:
                break
            stalled += 1
            time.sleep(1)

    def _collect_external_links(self):
        """Return the ``href`` of every element with the external-link class."""
        links = []
        try:
            elements = self.driver.find_elements(
                By.CLASS_NAME, self._EXTERNAL_LINK_CLASS)
            for element in elements:
                href = element.get_attribute("href")
                if href:
                    links.append(href)
        except Exception:
            # Best effort: keep whatever was gathered before the failure.
            print("No external Links found")
        return links

    def _collect_internal_links(self):
        """Return cleaned URLs for image elements whose ``src`` lacks "external"."""
        try:
            elements = self.driver.find_elements(
                By.CLASS_NAME, self._IMAGE_CLASS)
            sources = [element.get_attribute("src") for element in elements]
        except Exception:
            print("No internal Links found")
            return []
        hosted = [src for src in sources if src and "external" not in src]
        return self.cleaner(hosted)

    def download(self, speed):
        """Open the subreddit, scroll to the end, and download every image found.

        Args:
            speed: Controls parallelism; ``speed // 5`` worker threads are
                used, with a minimum of one.
        """
        num_worker = max(1, int(speed) // 5)
        self.driver.get("https://www.reddit.com/r/" + self.subreddit)
        self.driver.set_window_size(848, 1040)
        try:
            self.driver.find_element(
                By.CSS_SELECTOR, self._PROMPT_SELECTOR).click()
        except Exception:
            print("no prompt")

        self._scroll_to_bottom()

        links = self._collect_external_links()
        links.extend(self._collect_internal_links())

        # Leaving the ``with`` block waits for every submitted download.
        with ThreadPoolExecutor(max_workers=num_worker) as executor:
            futures = [
                executor.submit(self.image_getter, each, i)
                for i, each in enumerate(links)
            ]
        # Surface errors raised inside worker threads instead of dropping them.
        for future in futures:
            error = future.exception()
            if error is not None:
                print("download task failed: " + repr(error))

        os.chdir("./..")
def download_helper(subreddit="wallpapers/", path="./", i_speed=100):
    """Run one complete scrape: set up, download, and always shut the browser.

    Args:
        subreddit: Passed to ``RedditImageDownloader.setup_method``.
        path: Passed to ``RedditImageDownloader.setup_method``.
        i_speed: Passed to ``RedditImageDownloader.download`` as ``speed``.
    """
    st = RedditImageDownloader()
    try:
        st.setup_method(subreddit=subreddit, path=path)
        st.download(speed=i_speed)
    finally:
        # Quit the browser even when setup or download raised; skip when no
        # driver was ever created (or it was already released).
        if getattr(st, "driver", None) is not None:
            st.teardown_method()