From 93479c37c542a07ecdb0481e2b5e4947c592f609 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Thu, 16 Feb 2017 20:39:22 +0000 Subject: [PATCH] Initial commit --- .gitignore | 2 + composer.json | 20 ++ composer.lock | 518 ++++++++++++++++++++++++++++++++++ readme.md | 13 + src/SearchFailedException.php | 27 ++ src/SearchResult.php | 39 +++ src/Searcher.php | 144 ++++++++++ 7 files changed, 763 insertions(+) create mode 100644 .gitignore create mode 100644 composer.json create mode 100644 composer.lock create mode 100644 readme.md create mode 100644 src/SearchFailedException.php create mode 100644 src/SearchResult.php create mode 100644 src/Searcher.php diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ecdf2d7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +vendor +.idea diff --git a/composer.json b/composer.json new file mode 100644 index 0000000..666c84f --- /dev/null +++ b/composer.json @@ -0,0 +1,20 @@ +{ + "name": "room11/google-searcher", + "description": "Retrieves results from the first page of a Google search", + "minimum-stability": "dev", + "license": "MIT", + "authors": [ + { + "name": "Chris Wright", + "email": "github@daverandom.com" + } + ], + "autoload": { + "psr-4": {"Room11\\GoogleSearcher\\": "src/"} + }, + "require": { + "php": ">=7.1", + "amphp/artax": "^2", + "room11/dom-utils": "~1.0" + } +} diff --git a/composer.lock b/composer.lock new file mode 100644 index 0000000..be6fc6b --- /dev/null +++ b/composer.lock @@ -0,0 +1,518 @@ +{ + "_readme": [ + "This file locks the dependencies of your project to a known state", + "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file", + "This file is @generated automatically" + ], + "hash": "2d4b70fd6270ef3552c2cf4d1ac646df", + "content-hash": "173e184a4758efd87bb14c67dd64712b", + "packages": [ + { + "name": "amphp/amp", + "version": "v1.x-dev", + "source": { + "type": "git", + "url": "https://github.com/amphp/amp.git", + "reference": 
"ba8335a3296a499a655d4ea18510484ee3e1310a" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/amphp/amp/zipball/ba8335a3296a499a655d4ea18510484ee3e1310a", + "reference": "ba8335a3296a499a655d4ea18510484ee3e1310a", + "shasum": "" + }, + "require": { + "php": ">=5.5" + }, + "require-dev": { + "fabpot/php-cs-fixer": "~1.9", + "phpunit/phpunit": "~4.8" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "autoload": { + "psr-4": { + "Amp\\": "lib/" + }, + "files": [ + "lib/functions.php" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Daniel Lowrey", + "email": "rdlowrey@php.net", + "role": "Creator / Lead Developer" + } + ], + "description": "A non-blocking concurrency framework for PHP applications", + "homepage": "https://github.com/amphp/amp", + "keywords": [ + "async", + "concurrency", + "event", + "non-blocking", + "promise" + ], + "time": "2016-09-04 12:46:12" + }, + { + "name": "amphp/artax", + "version": "dev-master", + "source": { + "type": "git", + "url": "https://github.com/amphp/artax.git", + "reference": "99b72e4523c6e1e27d39de3a8b422767bcaa48ea" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/amphp/artax/zipball/99b72e4523c6e1e27d39de3a8b422767bcaa48ea", + "reference": "99b72e4523c6e1e27d39de3a8b422767bcaa48ea", + "shasum": "" + }, + "require": { + "amphp/amp": "^1", + "amphp/socket": "^0.9", + "php": ">=5.5.0" + }, + "require-dev": { + "friendsofphp/php-cs-fixer": "~1.9", + "phpunit/phpunit": "~4.8" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "2.x-dev", + "dev-1.x": "1.x-dev" + } + }, + "autoload": { + "psr-4": { + "Amp\\Artax\\": "lib/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Daniel Lowrey", + "email": "rdlowrey@gmail.com", + "role": "Creator / Lead Developer" + } + ], + 
"description": "Asynchronous parallel HTTP/1.1 client built on the Amp concurrency framework", + "homepage": "https://github.com/amphp/artax", + "keywords": [ + "async", + "client", + "http", + "non-blocking", + "parallel", + "rest" + ], + "time": "2016-11-27 17:09:01" + }, + { + "name": "amphp/cache", + "version": "v0.1.0", + "source": { + "type": "git", + "url": "https://github.com/amphp/cache.git", + "reference": "26709c198dcee686557801eda6d9345f3cfa8874" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/amphp/cache/zipball/26709c198dcee686557801eda6d9345f3cfa8874", + "reference": "26709c198dcee686557801eda6d9345f3cfa8874", + "shasum": "" + }, + "require": { + "amphp/amp": "^1" + }, + "require-dev": { + "fabpot/php-cs-fixer": "~1.9", + "phpunit/phpunit": "~4.8" + }, + "suggest": { + "amphp/redis": "For redis cache driver support" + }, + "type": "library", + "autoload": { + "psr-4": { + "Amp\\Cache\\": "lib/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Niklas Keller", + "email": "me@kelunik.com" + }, + { + "name": "Daniel Lowrey", + "email": "rdlowrey@php.net" + } + ], + "description": "A promise-aware caching API built on the amp concurrency framework", + "homepage": "https://github.com/amphp/cache", + "time": "2015-09-08 22:26:20" + }, + { + "name": "amphp/dns", + "version": "v0.8.14", + "source": { + "type": "git", + "url": "https://github.com/amphp/dns.git", + "reference": "5fc1cde2d29e94d731ab96d5ef5f9f20958315cc" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/amphp/dns/zipball/5fc1cde2d29e94d731ab96d5ef5f9f20958315cc", + "reference": "5fc1cde2d29e94d731ab96d5ef5f9f20958315cc", + "shasum": "" + }, + "require": { + "amphp/amp": "^1", + "amphp/cache": "^0.1", + "amphp/file": "^0.1", + "amphp/windows-registry": "^0.2.2", + "daverandom/libdns": "^1", + "php": ">=5.5" + }, + "require-dev": { + "friendsofphp/php-cs-fixer": "^1.9", + 
"phpunit/php-code-coverage": ">=2.2", + "phpunit/phpunit": "^4.8|^5.1.3" + }, + "type": "library", + "autoload": { + "psr-4": { + "Amp\\Dns\\": "lib" + }, + "files": [ + "lib/constants.php", + "lib/functions.php" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Bob Weinand", + "email": "bobwei9@hotmail.com" + }, + { + "name": "Niklas Keller", + "email": "me@kelunik.com" + }, + { + "name": "Daniel Lowrey", + "email": "rdlowrey@php.net" + }, + { + "name": "Chris Wright", + "email": "addr@daverandom.com" + } + ], + "description": "Async DNS resolution built on the amp concurrency framework", + "homepage": "https://github.com/amphp/dns", + "keywords": [ + "amp", + "async", + "client", + "dns", + "resolve" + ], + "time": "2017-02-05 22:17:40" + }, + { + "name": "amphp/file", + "version": "v0.1.3", + "source": { + "type": "git", + "url": "https://github.com/amphp/file.git", + "reference": "6612ae6757d4719492ed8b34ea6181ff67cfbed1" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/amphp/file/zipball/6612ae6757d4719492ed8b34ea6181ff67cfbed1", + "reference": "6612ae6757d4719492ed8b34ea6181ff67cfbed1", + "shasum": "" + }, + "require": { + "amphp/amp": "^1", + "php": ">=5.5" + }, + "require-dev": { + "friendsofphp/php-cs-fixer": "~1.9", + "phpunit/phpunit": "~4.8" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "0.1.0-dev", + "dev-amp_v2": "0.2.0-dev" + } + }, + "autoload": { + "psr-4": { + "Amp\\File\\": "lib" + }, + "files": [ + "lib/functions.php" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Daniel Lowrey", + "email": "rdlowrey@php.net" + } + ], + "description": "An async filesystem library built on the amp concurrency framework", + "homepage": "https://github.com/amphp/file", + "keywords": [ + "amp", + "amphp", + "async", + "disk", + "file", + "non-blocking", + "static" 
+ ], + "time": "2016-10-01 17:43:52" + }, + { + "name": "amphp/process", + "version": "v0.1.3", + "source": { + "type": "git", + "url": "https://github.com/amphp/process.git", + "reference": "f22cca2af36e442b771c0de2e24e8025550d8ffc" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/amphp/process/zipball/f22cca2af36e442b771c0de2e24e8025550d8ffc", + "reference": "f22cca2af36e442b771c0de2e24e8025550d8ffc", + "shasum": "" + }, + "require": { + "amphp/amp": "^1" + }, + "require-dev": { + "fabpot/php-cs-fixer": "~1.9", + "phpunit/phpunit": "^4.8" + }, + "type": "library", + "autoload": { + "classmap": [ + { + "Amp\\Process": "Process.php" + } + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Bob Weinand", + "email": "bobwei9@hotmail.com" + } + ], + "description": "Asynchronous process manager", + "homepage": "https://github.com/amphp/process", + "time": "2016-09-24 10:49:26" + }, + { + "name": "amphp/socket", + "version": "v0.9.9", + "source": { + "type": "git", + "url": "https://github.com/amphp/socket.git", + "reference": "722614608c1de7099661187fad4e15c876816db1" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/amphp/socket/zipball/722614608c1de7099661187fad4e15c876816db1", + "reference": "722614608c1de7099661187fad4e15c876816db1", + "shasum": "" + }, + "require": { + "amphp/amp": "^1", + "amphp/dns": "^0.8", + "php": ">=5.5" + }, + "require-dev": { + "fabpot/php-cs-fixer": "~1.9", + "phpunit/phpunit": "~4.8" + }, + "type": "library", + "autoload": { + "psr-4": { + "Amp\\Socket\\": "lib/" + }, + "files": [ + "lib/functions.php" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Daniel Lowrey", + "email": "rdlowrey@gmail.com" + } + ], + "description": "Async socket connection tools for the amp concurrency framework", + "homepage": "https://github.com/amphp/socket", + 
"keywords": [ + "amp", + "async", + "encryption", + "non-blocking", + "sockets", + "tcp", + "tls" + ], + "time": "2016-07-18 22:03:24" + }, + { + "name": "amphp/windows-registry", + "version": "v0.2.2", + "source": { + "type": "git", + "url": "https://github.com/amphp/windows-registry.git", + "reference": "e4420eb368008c8fe81c0b481506306272cc3d21" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/amphp/windows-registry/zipball/e4420eb368008c8fe81c0b481506306272cc3d21", + "reference": "e4420eb368008c8fe81c0b481506306272cc3d21", + "shasum": "" + }, + "require": { + "amphp/amp": "^1.2", + "amphp/process": "^0.1.3", + "php": ">=5.5" + }, + "type": "library", + "autoload": { + "psr-4": { + "Amp\\WindowsRegistry\\": "src" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Niklas Keller", + "email": "me@kelunik.com" + } + ], + "description": "Windows Registry Reader.", + "time": "2017-01-04 23:52:31" + }, + { + "name": "daverandom/libdns", + "version": "v1.1.0", + "source": { + "type": "git", + "url": "https://github.com/DaveRandom/LibDNS.git", + "reference": "67de5497e84d179dffc089c5eb6a1945eb4e9460" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/DaveRandom/LibDNS/zipball/67de5497e84d179dffc089c5eb6a1945eb4e9460", + "reference": "67de5497e84d179dffc089c5eb6a1945eb4e9460", + "shasum": "" + }, + "require": { + "php": ">=5.4.0" + }, + "type": "library", + "autoload": { + "psr-4": { + "LibDNS\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "description": "DNS protocol implementation written in pure PHP", + "keywords": [ + "dns" + ], + "time": "2016-04-29 20:47:45" + }, + { + "name": "room11/dom-utils", + "version": "v1.1.1", + "source": { + "type": "git", + "url": "https://github.com/Room-11/DOMUtils.git", + "reference": "724529187818c52e895914abc4969de11118093c" + }, + "dist": { + "type": 
"zip", + "url": "https://api.github.com/repos/Room-11/DOMUtils/zipball/724529187818c52e895914abc4969de11118093c", + "reference": "724529187818c52e895914abc4969de11118093c", + "shasum": "" + }, + "type": "library", + "autoload": { + "psr-4": { + "Room11\\DOMUtils\\": "src/" + }, + "files": [ + "src/functions.php" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Chris Wright" + } + ], + "description": "Utility functions for the PHP DOM extension", + "time": "2016-10-19 15:15:02" + } + ], + "packages-dev": [], + "aliases": [], + "minimum-stability": "dev", + "stability-flags": [], + "prefer-stable": false, + "prefer-lowest": false, + "platform": { + "php": ">=7.1" + }, + "platform-dev": [] +} diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..825a92f --- /dev/null +++ b/readme.md @@ -0,0 +1,13 @@ +# Google Searcher + +Simple Google page 1 scraper. Do not use. + +## Required PHP Version + +- PHP 7.1+ + +## Installation + +```bash +$ composer require room11/google-searcher +``` diff --git a/src/SearchFailedException.php b/src/SearchFailedException.php new file mode 100644 index 0000000..7107645 --- /dev/null +++ b/src/SearchFailedException.php @@ -0,0 +1,27 @@ +searchTerm = $searchTerm; + $this->searchUri = $searchUri; + } + + public function getSearchTerm(): string + { + return $this->searchTerm; + } + + public function getSearchUri(): string + { + return $this->searchUri; + } +} diff --git a/src/SearchResult.php b/src/SearchResult.php new file mode 100644 index 0000000..e7dd062 --- /dev/null +++ b/src/SearchResult.php @@ -0,0 +1,39 @@ +url = $url; + $this->title = $title; + $this->description = $description; + $this->date = $date; + } + + public function getUrl(): string + { + return $this->url; + } + + public function getTitle(): string + { + return $this->title; + } + + public function getDescription(): string + { + return $this->description; + } + + public function 
getDate(): ?\DateTimeImmutable + { + return $this->date; + } +} diff --git a/src/Searcher.php b/src/Searcher.php new file mode 100644 index 0000000..8349a4d --- /dev/null +++ b/src/Searcher.php @@ -0,0 +1,144 @@ + $term, + 'lr' => 'lang_en', + ]); + } + + /** + * Splits a result snippet into its description text and an optional date. + * + * When the snippet does not match PARSE_DESCRIPTION_REGEX it is returned unchanged with a null date. + * Otherwise capture groups 1-3 are read as day, month and year and group 4 as the description. + * + * @param string $description Text content of the snippet node + * @return array Two elements: the description string, then a \DateTimeImmutable or null + */ + private function parseDescription(string $description): array + { + if (!preg_match(self::PARSE_DESCRIPTION_REGEX, $description, $match)) { + return [$description, null]; + } + + /* The leading "!" resets every field missing from the format, so the time is midnight instead of the current system time */ + $date = \DateTimeImmutable::createFromFormat( + '!j M Y', + sprintf('%s %s %s', ltrim($match[1], '0'), $match[2], $match[3]) + ); + + /* createFromFormat() returns false when the text cannot be parsed; report that as "no date" rather than passing false on */ + return [$match[4], $date !== false ? $date : null]; + } + + /** + * Builds a SearchResult for each result node that contains an h3/a link; nodes without one are skipped. + * + * @param \DOMNodeList $resultNodes + * @param \DOMXPath $xpath + * @return SearchResult[] + */ + private function getSearchResults(\DOMNodeList $resultNodes, \DOMXPath $xpath): array + { + $results = []; + + foreach ($resultNodes as $resultNode) { + $linkNodes = $xpath->query(".//h3/a", $resultNode); + + if (!$linkNodes->length) { + continue; + } + + /** @var \DOMElement $linkNode */ + $linkNode = $linkNodes->item(0); + + $descriptionNodes = $xpath->query('.//span[@class="st"]', $resultNode); + + $description = 'No description available'; + $date = null; + + if ($descriptionNodes->length !== 0) { + list($description, $date) = $this->parseDescription($descriptionNodes->item(0)->textContent); + } + + $results[] = new SearchResult($linkNode->getAttribute("href"), $linkNode->textContent, $description, $date); + } + + return $results; + } + + /** + * Generator behind search(): requests the results page and extracts the results from it. + * + * Yields the value returned by the HTTP client's request() and continues with the response sent back in. + * + * @param string $term + * @return \Generator Generator whose return value is SearchResult[] (empty when no result nodes are found) + * @throws SearchFailedException When the status code is not 200 or the response HTML cannot be parsed + */ + private function doSearch(string $term) + { + $uri = $this->getSearchURL($term); + + $request = (new HttpRequest) + ->setMethod('GET') + ->setUri($uri) + ->setHeader('User-Agent', self::USER_AGENT); + + /** @var HttpResponse $response */ + $response = yield $this->httpClient->request($request); + + if ($response->getStatus() !== 200) { + throw new SearchFailedException( + "Google responded with an HTTP status code of {$response->getStatus()}", + $term, $uri + ); + } + + /* Convert the body only when Content-Type names a charset that does not contain self::ENCODING */ + if (preg_match('#charset\s*=\s*([^;]+)#i', trim(implode(', ', 
$response->getHeader('Content-Type'))), $match) + && !preg_match('/' . preg_quote(self::ENCODING, '/') . '/i', $match[1])) { + $body = iconv($match[1], self::ENCODING, $response->getBody()); + } + + /* Use the raw body when no conversion ran or the conversion produced an empty/false value */ + if (empty($body)) { + $body = $response->getBody(); + } + + try { + $dom = domdocument_load_html($body); + } catch (LibXMLFatalErrorException $e) { + throw new SearchFailedException("Failed parsing response HTML", $term, $uri, $e); + } + + $xpath = new \DOMXPath($dom); + $resultNodes = $xpath->query('//*[' . xpath_html_class('g') . ']'); + + return $resultNodes->length > 0 + ? $this->getSearchResults($resultNodes, $xpath) + : []; + } + + /** + * @param HttpClient $httpClient Client used to send the search request + */ + public function __construct(HttpClient $httpClient) + { + $this->httpClient = $httpClient; + } + + /** + * Starts a search by passing the doSearch() generator to resolve(). + * + * NOTE(review): presumably the promise resolves to SearchResult[] and fails with SearchFailedException; confirm against resolve(). + * + * @param string $term + * @return Promise + */ + public function search(string $term): Promise + { + return resolve($this->doSearch($term)); + } +}