fix: now using bing for search and fix json structure to parse images
Atul_Singh committed May 18, 2024
1 parent c4901a4 commit 9c29aba
Showing 3 changed files with 30 additions and 41 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -8,4 +8,5 @@ pinscrape.egg-info
 **/*.pyc
 **/pinscrape/*.pyc
 venv
-output
+output
+.idea
2 changes: 1 addition & 1 deletion pinscrape/_version.py
@@ -1 +1 @@
-__version__ = "3.0.5"
+__version__ = "3.1.0"
66 changes: 27 additions & 39 deletions pinscrape/pinscrape.py
@@ -1,4 +1,3 @@
-import re
 import json
 import os
 import cv2
@@ -19,64 +18,54 @@ def __init__(self):

     # ---------------------------------------- GET GOOGLE RESULTS ---------------------------------
     @staticmethod
-    def get_pinterest_links(body, max_images):
+    def get_pinterest_links(body, max_images: int):
         searched_urls = []
         html = soup(body, 'html.parser')
-        links = html.select('#main > div > div > div > a')
+        links = html.select('#b_results cite')
         for link in links:
-            link = link.get('href')
-            link = re.sub(r'/url\?q=', '', link)
-            if link[0] != "/" and "pinterest" in link:
+            link = link.text
+            if "pinterest" in link:
                 searched_urls.append(link)
-            #stops adding links if the limit has been reached
+            # stops adding links if the limit has been reached
             if max_images is not None and max_images == len(searched_urls):
                 break

         return searched_urls
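
The rewritten selector targets Bing's result markup: hits are listed under the ordered list with id "b_results", and the visible address of each hit sits in a cite element, so its text is usable directly instead of unwrapping Google's /url?q= redirects with re. A minimal standalone sketch of the same idea (the function name and query encoding are illustrative, not part of the package):

import requests
from bs4 import BeautifulSoup

def bing_pinterest_links(query: str, max_images: int = 5) -> list:
    # Fetch a Bing results page; "#b_results cite" holds the visible URL of each hit.
    res = requests.get(f"https://www.bing.com/search?q={query}%20pinterest")
    html = BeautifulSoup(res.content, "html.parser")
    urls = []
    for cite in html.select("#b_results cite"):
        url = cite.text
        if "pinterest" in url:
            urls.append(url)
        if max_images is not None and len(urls) == max_images:
            break
    return urls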

     # -------------------------- save json data from source code of given pinterest url -------------
-    def get_source(self, url, proxies):
+    def get_source(self, url: str, proxies: dict) -> None:
         try:
             res = get(url, proxies=proxies)
-        except Exception as e:
+        except Exception:
             return
         html = soup(res.text, 'html.parser')
-        json_data = html.find_all("script", attrs={"id": "__PWS_DATA__"})
-        for a in json_data:
-            self.json_data_list.append(a.string)
+        json_data = html.find_all("script", attrs={"id": "__PWS_INITIAL_PROPS__"})
+        self.json_data_list.append(json.loads(json_data[0].string))
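
Pinterest now ships its server-rendered state in a script tag with id "__PWS_INITIAL_PROPS__" rather than "__PWS_DATA__", and the commit parses the JSON up front instead of collecting raw strings. A hedged sketch of that step; unlike the committed json_data[0], it guards against pages that lack the tag, which would otherwise raise IndexError:

import json
import requests
from bs4 import BeautifulSoup

def fetch_initial_props(url: str) -> dict:
    # Return the parsed __PWS_INITIAL_PROPS__ payload, or {} if absent.
    res = requests.get(url)
    html = BeautifulSoup(res.text, "html.parser")
    tags = html.find_all("script", attrs={"id": "__PWS_INITIAL_PROPS__"})
    if not tags or not tags[0].string:
        return {}
    return json.loads(tags[0].string)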

     # --------------------------- READ JSON OF PINTEREST WEBSITE ----------------------
-    def save_image_url(self, max_images):
-        url_list = [i for i in self.json_data_list if i.strip()]
-        if not len(url_list):
-            return url_list
+    def save_image_url(self, max_images: int) -> list:
+        url_list = []
         for js in self.json_data_list:
             try:
-                data = DotMap(json.loads(js))
+                data = DotMap(js)
                 urls = []
-                for pin in data.props.initialReduxState.pins:
-                    if isinstance(data.props.initialReduxState.pins[pin].images.get("orig"), list):
-                        for i in data.props.initialReduxState.pins[pin].images.get("orig"):
+                for pin in data.initialReduxState.pins:
+                    if isinstance(data.initialReduxState.pins[pin].images.get("orig"), list):
+                        for i in data.initialReduxState.pins[pin].images.get("orig"):
                             urls.append(i.get("url"))
                     else:
-                        urls.append(data.props.initialReduxState.pins[pin].images.get("orig").get("url"))
+                        urls.append(data.initialReduxState.pins[pin].images.get("orig").get("url"))

                 for url in urls:
                     url_list.append(url)

                 #if the maximum has been achieved, return early
                 if max_images is not None and max_images == len(url_list):
                     return list(set(url_list))

-            except Exception as e:
+            except Exception:
                 continue

         return list(set(url_list))
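
The parsed payload's structure also changed: pins now sit directly under initialReduxState.pins, without the props level the old paths assumed, and each pin's original-resolution images hang off images.orig. The same traversal with plain dict access instead of DotMap, as a sketch:

def collect_image_urls(payload: dict) -> list:
    # Walk initialReduxState.pins and gather original-resolution image URLs.
    urls = []
    pins = payload.get("initialReduxState", {}).get("pins", {})
    for pin in pins.values():
        orig = pin.get("images", {}).get("orig")
        if isinstance(orig, list):      # some pins carry several originals
            urls.extend(i.get("url") for i in orig)
        elif isinstance(orig, dict):
            urls.append(orig.get("url"))
    return list(set(urls))              # de-duplicate, as save_image_url does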

     # ------------------------------ image hash calculation -------------------------
-    def dhash(self, image, hashSize=8):
+    def dhash(self, image, hashSize: int = 8):
         resized = cv2.resize(image, (hashSize + 1, hashSize))
         diff = resized[:, 1:] > resized[:, :-1]
         return sum([2 ** i for (i, v) in enumerate(diff.flatten()) if v])
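
dhash is a standard difference hash: the image is resized to (hashSize + 1) x hashSize, horizontally adjacent pixels are compared, and the resulting bit grid is packed into one integer, so near-identical downloads yield hashes a small Hamming distance apart. A self-contained usage sketch (grayscale input, the file names, and the threshold of 10 are assumptions, not package values):

import cv2

def dhash(image, hash_size: int = 8) -> int:
    # Compare each pixel of a downscaled image with its right-hand neighbour.
    resized = cv2.resize(image, (hash_size + 1, hash_size))
    diff = resized[:, 1:] > resized[:, :-1]
    return sum(2 ** i for i, v in enumerate(diff.flatten()) if v)

def hamming(h1: int, h2: int) -> int:
    return bin(h1 ^ h2).count("1")  # number of differing bits

a = cv2.imread("a.jpg", cv2.IMREAD_GRAYSCALE)
b = cv2.imread("b.jpg", cv2.IMREAD_GRAYSCALE)
if hamming(dhash(a), dhash(b)) <= 10:
    print("likely near-duplicates")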
@@ -108,19 +97,17 @@ def download(self, url_list, num_of_workers, output_folder):
     # -------------------------- get user keyword and google search for that keywords ---------------------
     @staticmethod
     def start_scraping(max_images, key=None, proxies={}):
-        assert key != None, "Please provide keyword for searching images"
+        assert key is not None, "Please provide keyword for searching images"
         keyword = key + " pinterest"
         keyword = keyword.replace("+", "%20")
-        url = f'http://www.google.co.in/search?hl=en&q={keyword}'
+        url = f'https://www.bing.com/search?q={keyword}&pq=messi+pinterest&first=1&FORM=PERE'
         res = get(url, proxies=proxies)
-        searched_urls = PinterestImageScraper.get_pinterest_links(res.content,max_images)
+        searched_urls = PinterestImageScraper.get_pinterest_links(res.content, max_images)

         return searched_urls, key.replace(" ", "_")
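
The URL swap is the heart of the fix: Google's endpoint is replaced by Bing's, with first=1 asking for the first results page. The hard-coded pq=messi+pinterest (Bing's record of the query as typed) is not derived from key and looks like a leftover from manual testing. For comparison, a sketch that builds the same URL with proper encoding instead of replace("+", "%20"):

from urllib.parse import quote_plus

def bing_search_url(key: str) -> str:
    # quote_plus handles spaces and reserved characters in the keyword.
    q = quote_plus(f"{key} pinterest")
    return f"https://www.bing.com/search?q={q}&first=1&FORM=PERE"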


-    def scrape(self, key=None, output_folder="", proxies={}, threads=10, max_images: int = None):
-        extracted_urls, keyword = PinterestImageScraper.start_scraping(max_images,key, proxies)
-        return_data = {}
+    def scrape(self, key: str = None, output_folder: str = "", proxies: dict = {}, threads: int = 10, max_images: int = None) -> dict:
+        extracted_urls, keyword = PinterestImageScraper.start_scraping(max_images, key, proxies)
         self.unique_img = []
         self.json_data_list = []

@@ -144,17 +131,18 @@ def scrape(self, key=None, output_folder="", proxies={}, threads=10, max_images:
                 self.download(url_list, threads, out_folder)
             except KeyboardInterrupt:
                 return return_data

             return_data["isDownloaded"] = True
             return return_data

         return return_data


scraper = PinterestImageScraper()


 if __name__ == "__main__":
-    details = scraper.scrape("messi", "output")
+    details = scraper.scrape("messi", "output", {}, 10, None)

     if details["isDownloaded"]:
         print("\nDownloading completed !!")
