fix: now using bing for search and fix json structure to parse images
Atul_Singh committed May 18, 2024
1 parent c4901a4 commit 9c29aba
Showing 3 changed files with 30 additions and 41 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -8,4 +8,5 @@ pinscrape.egg-info
 **/*.pyc
 **/pinscrape/*.pyc
 venv
-output
+output
+.idea
2 changes: 1 addition & 1 deletion pinscrape/_version.py
@@ -1 +1 @@
-__version__ = "3.0.5"
+__version__ = "3.1.0"
66 changes: 27 additions & 39 deletions pinscrape/pinscrape.py
@@ -1,4 +1,3 @@
-import re
 import json
 import os
 import cv2
@@ -19,64 +18,54 @@ def __init__(self):

     # ---------------------------------------- GET GOOGLE RESULTS ---------------------------------
     @staticmethod
-    def get_pinterest_links(body, max_images):
+    def get_pinterest_links(body, max_images: int):
         searched_urls = []
         html = soup(body, 'html.parser')
-        links = html.select('#main > div > div > div > a')
+        links = html.select('#b_results cite')
         for link in links:
-            link = link.get('href')
-            link = re.sub(r'/url\?q=', '', link)
-            if link[0] != "/" and "pinterest" in link:
+            link = link.text
+            if "pinterest" in link:
                 searched_urls.append(link)
-            #stops adding links if the limit has been reached
+            # stops adding links if the limit has been reached
             if max_images is not None and max_images == len(searched_urls):
                 break

         return searched_urls
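
The rewritten selector targets Bing's result markup: hits are listed under the ordered list with id "b_results", and the visible address of each hit sits in a cite element, so its text is usable directly instead of unwrapping Google's /url?q= redirects with re. A minimal standalone sketch of the same idea (the function name and query encoding are illustrative, not part of the package):

import requests
from bs4 import BeautifulSoup

def bing_pinterest_links(query: str, max_images: int = 5) -> list:
    # Fetch a Bing results page; "#b_results cite" holds the visible URL of each hit.
    res = requests.get(f"https://www.bing.com/search?q={query}%20pinterest")
    html = BeautifulSoup(res.content, "html.parser")
    urls = []
    for cite in html.select("#b_results cite"):
        url = cite.text
        if "pinterest" in url:
            urls.append(url)
        if max_images is not None and len(urls) == max_images:
            break
    return urls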

     # -------------------------- save json data from source code of given pinterest url -------------
-    def get_source(self, url, proxies):
+    def get_source(self, url: str, proxies: dict) -> None:
         try:
             res = get(url, proxies=proxies)
-        except Exception as e:
+        except Exception:
             return
         html = soup(res.text, 'html.parser')
-        json_data = html.find_all("script", attrs={"id": "__PWS_DATA__"})
-        for a in json_data:
-            self.json_data_list.append(a.string)
+        json_data = html.find_all("script", attrs={"id": "__PWS_INITIAL_PROPS__"})
+        self.json_data_list.append(json.loads(json_data[0].string))
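
Pinterest now ships its server-rendered state in a script tag with id "__PWS_INITIAL_PROPS__" rather than "__PWS_DATA__", and the commit parses the JSON up front instead of collecting raw strings. A hedged sketch of that step; unlike the committed json_data[0], it guards against pages that lack the tag, which would otherwise raise IndexError:

import json
import requests
from bs4 import BeautifulSoup

def fetch_initial_props(url: str) -> dict:
    # Return the parsed __PWS_INITIAL_PROPS__ payload, or {} if absent.
    res = requests.get(url)
    html = BeautifulSoup(res.text, "html.parser")
    tags = html.find_all("script", attrs={"id": "__PWS_INITIAL_PROPS__"})
    if not tags or not tags[0].string:
        return {}
    return json.loads(tags[0].string)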

     # --------------------------- READ JSON OF PINTEREST WEBSITE ----------------------
-    def save_image_url(self, max_images):
-        url_list = [i for i in self.json_data_list if i.strip()]
-        if not len(url_list):
-            return url_list
+    def save_image_url(self, max_images: int) -> list:
+        url_list = []
         for js in self.json_data_list:
             try:
-                data = DotMap(json.loads(js))
+                data = DotMap(js)
                 urls = []
-                for pin in data.props.initialReduxState.pins:
-                    if isinstance(data.props.initialReduxState.pins[pin].images.get("orig"), list):
-                        for i in data.props.initialReduxState.pins[pin].images.get("orig"):
+                for pin in data.initialReduxState.pins:
+                    if isinstance(data.initialReduxState.pins[pin].images.get("orig"), list):
+                        for i in data.initialReduxState.pins[pin].images.get("orig"):
                             urls.append(i.get("url"))
                     else:
-                        urls.append(data.props.initialReduxState.pins[pin].images.get("orig").get("url"))
+                        urls.append(data.initialReduxState.pins[pin].images.get("orig").get("url"))

                 for url in urls:
                     url_list.append(url)

                 #if the maximum has been achieved, return early
                 if max_images is not None and max_images == len(url_list):
                     return list(set(url_list))

-            except Exception as e:
+            except Exception:
                 continue

         return list(set(url_list))
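
The parsed payload's structure also changed: pins now sit directly under initialReduxState.pins, without the props level the old paths assumed, and each pin's original-resolution images hang off images.orig. The same traversal with plain dict access instead of DotMap, as a sketch:

def collect_image_urls(payload: dict) -> list:
    # Walk initialReduxState.pins and gather original-resolution image URLs.
    urls = []
    pins = payload.get("initialReduxState", {}).get("pins", {})
    for pin in pins.values():
        orig = pin.get("images", {}).get("orig")
        if isinstance(orig, list):      # some pins carry several originals
            urls.extend(i.get("url") for i in orig)
        elif isinstance(orig, dict):
            urls.append(orig.get("url"))
    return list(set(urls))              # de-duplicate, as save_image_url does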

     # ------------------------------ image hash calculation -------------------------
-    def dhash(self, image, hashSize=8):
+    def dhash(self, image, hashSize: int = 8):
         resized = cv2.resize(image, (hashSize + 1, hashSize))
         diff = resized[:, 1:] > resized[:, :-1]
         return sum([2 ** i for (i, v) in enumerate(diff.flatten()) if v])
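
dhash is a standard difference hash: the image is resized to (hashSize + 1) x hashSize, horizontally adjacent pixels are compared, and the resulting bit grid is packed into one integer, so near-identical downloads yield hashes a small Hamming distance apart. A self-contained usage sketch (grayscale input, the file names, and the threshold of 10 are assumptions, not package values):

import cv2

def dhash(image, hash_size: int = 8) -> int:
    # Compare each pixel of a downscaled image with its right-hand neighbour.
    resized = cv2.resize(image, (hash_size + 1, hash_size))
    diff = resized[:, 1:] > resized[:, :-1]
    return sum(2 ** i for i, v in enumerate(diff.flatten()) if v)

def hamming(h1: int, h2: int) -> int:
    return bin(h1 ^ h2).count("1")  # number of differing bits

a = cv2.imread("a.jpg", cv2.IMREAD_GRAYSCALE)
b = cv2.imread("b.jpg", cv2.IMREAD_GRAYSCALE)
if hamming(dhash(a), dhash(b)) <= 10:
    print("likely near-duplicates")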
@@ -108,19 +97,17 @@ def download(self, url_list, num_of_workers, output_folder):
     # -------------------------- get user keyword and google search for that keywords ---------------------
     @staticmethod
     def start_scraping(max_images, key=None, proxies={}):
-        assert key != None, "Please provide keyword for searching images"
+        assert key is not None, "Please provide keyword for searching images"
         keyword = key + " pinterest"
         keyword = keyword.replace("+", "%20")
-        url = f'http://www.google.co.in/search?hl=en&q={keyword}'
+        url = f'https://www.bing.com/search?q={keyword}&pq=messi+pinterest&first=1&FORM=PERE'
         res = get(url, proxies=proxies)
-        searched_urls = PinterestImageScraper.get_pinterest_links(res.content,max_images)
+        searched_urls = PinterestImageScraper.get_pinterest_links(res.content, max_images)

         return searched_urls, key.replace(" ", "_")
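
The URL swap is the heart of the fix: Google's endpoint is replaced by Bing's, with first=1 asking for the first results page. The hard-coded pq=messi+pinterest (Bing's record of the query as typed) is not derived from key and looks like a leftover from manual testing. For comparison, a sketch that builds the same URL with proper encoding instead of replace("+", "%20"):

from urllib.parse import quote_plus

def bing_search_url(key: str) -> str:
    # quote_plus handles spaces and reserved characters in the keyword.
    q = quote_plus(f"{key} pinterest")
    return f"https://www.bing.com/search?q={q}&first=1&FORM=PERE"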


-    def scrape(self, key=None, output_folder="", proxies={}, threads=10, max_images: int = None):
-        extracted_urls, keyword = PinterestImageScraper.start_scraping(max_images,key, proxies)
-        return_data = {}
+    def scrape(self, key: str = None, output_folder: str = "", proxies: dict = {}, threads: int = 10, max_images: int = None) -> dict:
+        extracted_urls, keyword = PinterestImageScraper.start_scraping(max_images, key, proxies)
         self.unique_img = []
         self.json_data_list = []

@@ -144,17 +131,18 @@ def scrape(self, key=None, output_folder="", proxies={}, threads=10, max_images:
                 self.download(url_list, threads, out_folder)
             except KeyboardInterrupt:
                 return return_data

             return_data["isDownloaded"] = True
             return return_data

         return return_data


scraper = PinterestImageScraper()


 if __name__ == "__main__":
-    details = scraper.scrape("messi", "output")
+    details = scraper.scrape("messi", "output", {}, 10, None)

     if details["isDownloaded"]:
         print("\nDownloading completed !!")
