feat: added extracted_urls list to print URLs returned by the search engine
iamatulsingh committed Jul 4, 2024
1 parent cc58da9 commit 89d48ee
Showing 4 changed files with 15 additions and 12 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -11,10 +11,10 @@ details = pinscrape.scraper.scrape("messi", "output", {}, 10, 15)
 if details["isDownloaded"]:
     print("\nDownloading completed !!")
     print(f"\nTotal urls found: {len(details['extracted_urls'])}")
-    print(f"\nTotal images downloaded (including duplicate images): {len(details['url_list'])}")
+    print(f"\nTotal images downloaded (including duplicate images): {len(details['urls_list'])}")
     print(details)
 else:
-    print("\nNothing to download !!")
+    print("\nNothing to download !!", details)
 ```

 `scrape("messi", "output", {}, 10, 15)` <br/>
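For context, here is a minimal consumption sketch of the dict that `scrape()` returns after this commit. The keys come straight from the `return_data` literal in `pinscrape/pinscrape.py` below; the status-code guard is an illustrative extra, not part of the README example:

```python
from pinscrape import pinscrape

details = pinscrape.scraper.scrape("messi", "output", {}, 10, 15)

if details["search_engine_status_code"] != 200:
    # illustrative guard: the dict exposes the Bing response status
    print("Search request failed:", details["search_engine_status_code"])
elif details["isDownloaded"]:
    print(f"Total urls found: {len(details['extracted_urls'])}")
    # 'urls_list' is the key renamed from 'url_list' in this commit
    print(f"Images downloaded: {len(details['urls_list'])}")
    # 'searched_urls' is new: every link the search engine returned,
    # Pinterest or not
    print(f"Raw search results: {len(details['searched_urls'])}")
else:
    print("Nothing to download !!", details)
```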
2 changes: 1 addition & 1 deletion e2e.py
@@ -8,7 +8,7 @@ def test_single_data():
     if details["isDownloaded"]:
         print("\nDownloading completed !!")
         print(f"\nTotal urls found: {len(details['extracted_urls'])}")
-        print(f"\nTotal images downloaded (including duplicate images): {len(details['url_list'])}")
+        print(f"\nTotal images downloaded (including duplicate images): {len(details['urls_list'])}")
         print(details)
     else:
         print("\nNothing to download !!", details)
2 changes: 1 addition & 1 deletion pinscrape/_version.py
@@ -1 +1 @@
-__version__ = "3.2.2"
+__version__ = "3.2.3"
19 changes: 11 additions & 8 deletions pinscrape/pinscrape.py
@@ -21,15 +21,17 @@ def __init__(self):
     def get_pinterest_links(body, max_images: int):
         searched_urls = []
         html = soup(body, 'html.parser')
+        all_urls = []
         links = html.select('#b_results cite')
         for link in links:
             link = link.text
+            all_urls.append(link)
             if "pinterest" in link:
                 searched_urls.append(link)
                 # stops adding links if the limit has been reached
                 if max_images is not None and max_images == len(searched_urls):
                     break
-        return searched_urls
+        return searched_urls, all_urls

     # -------------------------- save json data from source code of given pinterest url -------------
     def get_source(self, url: str, proxies: dict) -> None:
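A minimal sketch of the new tuple contract in `get_pinterest_links`, assuming it is exposed as a static method on `PinterestImageScraper` (as the call sites in this diff suggest); the Bing-style markup below is invented for illustration:

```python
from pinscrape.pinscrape import PinterestImageScraper

# Invented search-result markup: two Pinterest links, one unrelated link.
body = b"""
<ol id="b_results">
  <li><cite>https://www.pinterest.com/pin/111</cite></li>
  <li><cite>https://example.com/not-pinterest</cite></li>
  <li><cite>https://in.pinterest.com/pin/222</cite></li>
</ol>
"""

searched_urls, all_urls = PinterestImageScraper.get_pinterest_links(body, 2)
print(searched_urls)  # only links containing "pinterest", capped at max_images
print(all_urls)       # every '#b_results cite' link seen before the cap was hit
```

Returning `all_urls` alongside the filtered list is what lets `scrape()` expose the raw search results without a second request.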
@@ -108,34 +110,35 @@ def start_scraping(max_images, key=None, proxies: dict = {}):
         keyword = keyword.replace("+", "%20")
         url = f'https://www.bing.com/search?q={keyword}&first=1&FORM=PERE'
         res = get(url, proxies=proxies, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0"})
-        searched_urls = PinterestImageScraper.get_pinterest_links(res.content, max_images)
+        searched_urls, links = PinterestImageScraper.get_pinterest_links(res.content, max_images)

-        return searched_urls, key.replace(" ", "_"), res.status_code
+        return searched_urls, key.replace(" ", "_"), res.status_code, links

     def scrape(self, key: str = None, output_folder: str = "", proxies: dict = {}, threads: int = 10, max_images: int = None) -> dict:
-        extracted_urls, keyword, search_engine_status_code = PinterestImageScraper.start_scraping(max_images, key, proxies)
+        extracted_urls, keyword, search_engine_status_code, links = PinterestImageScraper.start_scraping(max_images, key, proxies)
         self.unique_img = []
         self.json_data_list = []

         for i in extracted_urls:
             self.get_source(i, proxies)

         # get all urls of images and save in a list
-        url_list = self.save_image_url(max_images)
+        urls_list = self.save_image_url(max_images)

         return_data = {
             "isDownloaded": False,
             "search_engine_status_code": search_engine_status_code,
-            "url_list": url_list,
+            "urls_list": urls_list,
+            "searched_urls": links,
             "extracted_urls": extracted_urls,
             "keyword": key
         }

         # download images from saved images url
-        if len(url_list):
+        if len(urls_list):
             try:
                 out_folder = output_folder if output_folder else key
-                self.download(url_list, threads, out_folder)
+                self.download(urls_list, threads, out_folder)
             except KeyboardInterrupt:
                 return return_data
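Putting it together, the dict returned by `scrape()` now has this shape (values are illustrative placeholders; the keys are exactly those of the `return_data` literal above):

```python
# Illustrative shape only; real values depend on the search results.
details = {
    "isDownloaded": True,              # False if nothing was downloaded or on Ctrl-C
    "search_engine_status_code": 200,  # HTTP status of the Bing search request
    "urls_list": ["https://i.pinimg.com/..."],            # image URLs saved for download (was 'url_list')
    "searched_urls": ["https://..."],                     # new: every link the search engine returned
    "extracted_urls": ["https://www.pinterest.com/..."],  # Pinterest links filtered from those results
    "keyword": "messi",
}
```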
