diff --git a/.gitignore b/.gitignore
index 71f56d9..c46238f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,4 +8,5 @@ pinscrape.egg-info
 **/*.pyc
 **/pinscrape/*.pyc
 venv
-output
\ No newline at end of file
+output
+.idea
\ No newline at end of file
diff --git a/pinscrape/_version.py b/pinscrape/_version.py
index e94f36f..f5f41e5 100644
--- a/pinscrape/_version.py
+++ b/pinscrape/_version.py
@@ -1 +1 @@
-__version__ = "3.0.5"
+__version__ = "3.1.0"
diff --git a/pinscrape/pinscrape.py b/pinscrape/pinscrape.py
index c7618bc..e75e5d4 100644
--- a/pinscrape/pinscrape.py
+++ b/pinscrape/pinscrape.py
@@ -1,4 +1,3 @@
-import re
 import json
 import os
 import cv2
@@ -19,64 +18,54 @@ def __init__(self):
 
     # ---------------------------------------- GET GOOGLE RESULTS ---------------------------------
     @staticmethod
-    def get_pinterest_links(body, max_images):
+    def get_pinterest_links(body, max_images: int):
         searched_urls = []
         html = soup(body, 'html.parser')
-        links = html.select('#main > div > div > div > a')
+        links = html.select('#b_results cite')
         for link in links:
-            link = link.get('href')
-            link = re.sub(r'/url\?q=', '', link)
-            if link[0] != "/" and "pinterest" in link:
+            link = link.text
+            if "pinterest" in link:
                 searched_urls.append(link)
-                #stops adding links if the limit has been reached
+                # stops adding links if the limit has been reached
                 if max_images is not None and max_images == len(searched_urls):
                     break
-
         return searched_urls
 
     # -------------------------- save json data from source code of given pinterest url -------------
-    def get_source(self, url, proxies):
+    def get_source(self, url: str, proxies: dict) -> None:
         try:
             res = get(url, proxies=proxies)
-        except Exception as e:
+        except Exception:
             return
         html = soup(res.text, 'html.parser')
-        json_data = html.find_all("script", attrs={"id": "__PWS_DATA__"})
-        for a in json_data:
-            self.json_data_list.append(a.string)
+        json_data = html.find_all("script", attrs={"id": "__PWS_INITIAL_PROPS__"})
+        self.json_data_list.append(json.loads(json_data[0].string))
 
     # --------------------------- READ JSON OF PINTEREST WEBSITE ----------------------
-    def save_image_url(self, max_images):
-        url_list = [i for i in self.json_data_list if i.strip()]
-        if not len(url_list):
-            return url_list
+    def save_image_url(self, max_images: int) -> list:
         url_list = []
         for js in self.json_data_list:
             try:
-                data = DotMap(json.loads(js))
+                data = DotMap(js)
                 urls = []
-                for pin in data.props.initialReduxState.pins:
-                    if isinstance(data.props.initialReduxState.pins[pin].images.get("orig"), list):
-                        for i in data.props.initialReduxState.pins[pin].images.get("orig"):
+                for pin in data.initialReduxState.pins:
+                    if isinstance(data.initialReduxState.pins[pin].images.get("orig"), list):
+                        for i in data.initialReduxState.pins[pin].images.get("orig"):
                             urls.append(i.get("url"))
                     else:
-                        urls.append(data.props.initialReduxState.pins[pin].images.get("orig").get("url"))
+                        urls.append(data.initialReduxState.pins[pin].images.get("orig").get("url"))
 
                 for url in urls:
                     url_list.append(url)
-
-                #if the maximum has been achieved, return early
                     if max_images is not None and max_images == len(url_list):
                         return list(set(url_list))
-
-
-            except Exception as e:
+            except Exception:
                 continue
-
+
         return list(set(url_list))
 
     # ------------------------------ image hash calculation -------------------------
-    def dhash(self, image, hashSize=8):
+    def dhash(self, image, hashSize: int = 8):
         resized = cv2.resize(image, (hashSize + 1, hashSize))
         diff = resized[:, 1:] > resized[:, :-1]
         return sum([2 ** i for (i, v) in enumerate(diff.flatten()) if v])
@@ -108,19 +97,17 @@ def download(self, url_list, num_of_workers, output_folder):
     # -------------------------- get user keyword and google search for that keywords ---------------------
     @staticmethod
     def start_scraping(max_images, key=None, proxies={}):
-        assert key != None, "Please provide keyword for searching images"
+        assert key is not None, "Please provide keyword for searching images"
         keyword = key + " pinterest"
         keyword = keyword.replace("+", "%20")
-        url = f'http://www.google.co.in/search?hl=en&q={keyword}'
+        url = f'https://www.bing.com/search?q={keyword}&pq=messi+pinterest&first=1&FORM=PERE'
         res = get(url, proxies=proxies)
-        searched_urls = PinterestImageScraper.get_pinterest_links(res.content,max_images)
+        searched_urls = PinterestImageScraper.get_pinterest_links(res.content, max_images)
 
         return searched_urls, key.replace(" ", "_")
 
-
-    def scrape(self, key=None, output_folder="", proxies={}, threads=10, max_images: int = None):
-        extracted_urls, keyword = PinterestImageScraper.start_scraping(max_images,key, proxies)
-        return_data = {}
+    def scrape(self, key: str = None, output_folder: str = "", proxies: dict = {}, threads: int = 10, max_images: int = None) -> dict:
+        extracted_urls, keyword = PinterestImageScraper.start_scraping(max_images, key, proxies)
 
         self.unique_img = []
         self.json_data_list = []
@@ -144,17 +131,18 @@ def scrape(self, key=None, output_folder="", proxies={}, threads=10, max_images: int = None):
                 self.download(url_list, threads, out_folder)
             except KeyboardInterrupt:
                 return return_data
-
+
             return_data["isDownloaded"] = True
             return return_data
-
+
         return return_data
 
 
 scraper = PinterestImageScraper()
 
+
 if __name__ == "__main__":
-    details = scraper.scrape("messi", "output")
+    details = scraper.scrape("messi", "output", {}, 10, None)
     if details["isDownloaded"]:
         print("\nDownloading completed !!")
 
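For context (not part of the patch): a minimal usage sketch of the 3.1.0 API this diff produces, assuming the package is installed and importable as `pinscrape`. The keyword and output folder mirror the `__main__` block above; `max_images=20` is an illustrative value.

```python
# Minimal sketch of driving the module-level scraper created in pinscrape.py.
# Keyword and folder mirror the __main__ block; max_images=20 is illustrative.
from pinscrape import pinscrape

details = pinscrape.scraper.scrape(
    key="messi",             # search keyword, combined with " pinterest" for the Bing query
    output_folder="output",  # where downloaded images are written
    proxies={},              # optional requests-style proxy mapping
    threads=10,              # number of download workers
    max_images=20,           # stop collecting URLs once this many are found (None = no cap)
)

if details["isDownloaded"]:
    print("Downloading completed!")
```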