From d31ab23adaed62a6f2f4450e9c5b4509e16c5de1 Mon Sep 17 00:00:00 2001
From: Jesus Hernandez
Date: Fri, 20 Sep 2024 09:24:17 +0000
Subject: [PATCH] feat: Add spider RequestDeduplicationMiddleware

---
NOTE(review): this copy of the patch has lost the beginning of both new files
(the text between each hunk header and the first visible line, ` false,` and
`createMiddleware();`). Restore it from the original commit before applying;
the hunk line counts below assume the missing head is back in place.

 .../RequestDeduplicationMiddleware.php        | 148 ++++++++++++++++++
 .../RequestDeduplicationMiddlewareTest.php    |  91 +++++++++++++
 2 files changed, 239 insertions(+)
 create mode 100644 src/Spider/Middleware/RequestDeduplicationMiddleware.php
 create mode 100644 tests/Spider/Middleware/RequestDeduplicationMiddlewareTest.php

diff --git a/src/Spider/Middleware/RequestDeduplicationMiddleware.php b/src/Spider/Middleware/RequestDeduplicationMiddleware.php
new file mode 100644
index 0000000..6eacd43
--- /dev/null
+++ b/src/Spider/Middleware/RequestDeduplicationMiddleware.php
@@ -0,0 +1,148 @@
+ false,
+        'ignore_trailing_slashes' => true,
+        'ignore_query_string' => false,
+        'seen_uris_cache_size' => 10000,
+        ];
+    }
+
+    /**
+     * Drops the request when its normalized URI has already been seen.
+     *
+     * @param Request  $request  Request to check.
+     * @param Response $response Not used by this middleware.
+     *
+     * @return Request The request as given, or the result of `drop()` when it is a duplicate.
+     */
+    public function handleRequest(Request $request, Response $response): Request
+    {
+        $uri = $request->getUri();
+
+        if (!$this->isDuplicatedUri($uri)) {
+            return $request;
+        }
+
+        $this->logger->info(
+            '[RequestDeduplicationMiddleware] Dropping duplicate request',
+            ['uri' => $uri],
+        );
+
+        return $request->drop('Duplicate request');
+    }
+
+    /**
+     * Records a sighting of the URI and reports whether it was seen before.
+     *
+     * A first sighting is stored with a hit count of 1 and then runs cache
+     * eviction; every later sighting increments the stored hit count.
+     */
+    private function isDuplicatedUri(string $uri): bool
+    {
+        $uriHash = $this->hashUri($uri);
+
+        if (isset($this->seenUriHashesHits[$uriHash])) {
+            ++$this->seenUriHashesHits[$uriHash];
+
+            return true;
+        }
+
+        $this->seenUriHashesHits[$uriHash] = 1;
+        $this->cacheEviction();
+
+        return false;
+    }
+
+    /**
+     * Builds the cache key for a URI: the md5 of the URI after applying the
+     * `ignore_url_fragments`, `ignore_trailing_slashes` and `ignore_query_string` options.
+     *
+     * A URI that parse_url() cannot parse is hashed as-is instead of being
+     * passed on to http_build_url() as `false`.
+     */
+    private function hashUri(string $uri): string
+    {
+        $parts = parse_url($uri);
+
+        if (false === $parts) {
+            // parse_url() returns false on seriously malformed URLs; there is
+            // nothing to normalize, so deduplicate on the raw string.
+            return md5($uri);
+        }
+
+        $replaceFlags = HTTP_URL_REPLACE;
+
+        if ($this->option('ignore_url_fragments')) {
+            $replaceFlags |= HTTP_URL_STRIP_FRAGMENT;
+        }
+
+        if ($this->option('ignore_trailing_slashes') && isset($parts['path'])) {
+            $parts['path'] = rtrim($parts['path'], '/');
+        }
+
+        if ($this->option('ignore_query_string')) {
+            $replaceFlags |= HTTP_URL_STRIP_QUERY;
+        }
+
+        // md5 is only a fast, short cache key here, not a security measure.
+        return md5(http_build_url($uri, $parts, $replaceFlags));
+    }
+
+    /**
+     * Shrinks the seen-URI cache once it holds more than `seen_uris_cache_size` entries.
+     *
+     * Only entries whose hit count is strictly above the average are kept, so a URI
+     * seen once (including the one that triggered the eviction) is always dropped.
+     */
+    private function cacheEviction(): void
+    {
+        $cacheSize = count($this->seenUriHashesHits);
+
+        if ($cacheSize <= $this->option('seen_uris_cache_size')) {
+            return;
+        }
+
+        $averageHitCount = array_sum($this->seenUriHashesHits) / $cacheSize;
+        $this->seenUriHashesHits = array_filter(
+            $this->seenUriHashesHits,
+            static fn (int $hitCount): bool => $hitCount > $averageHitCount,
+        );
+
+        $this->logger->info(
+            '[RequestDeduplicationMiddleware] Cache eviction',
+            [
+                'average_hit_count' => $averageHitCount,
+                'remaining_cache_size' => count($this->seenUriHashesHits),
+            ],
+        );
+    }
+}
diff --git a/tests/Spider/Middleware/RequestDeduplicationMiddlewareTest.php b/tests/Spider/Middleware/RequestDeduplicationMiddlewareTest.php
new file mode 100644
index 0000000..7dcc6c5
--- /dev/null
+++ b/tests/Spider/Middleware/RequestDeduplicationMiddlewareTest.php
@@ -0,0 +1,91 @@
+createMiddleware();
+
+        $processedRequest = $middleware
+            ->handleRequest($this->makeRequest($uri), $this->makeResponse());
+        self::assertFalse($processedRequest->wasDropped());
+
+        $processedRequest = $middleware
+            ->handleRequest($this->makeRequest($uri), $this->makeResponse());
+        self::assertTrue($processedRequest->wasDropped());
+    }
+
+    /**
+     * Once the cache outgrows its size, only URIs hit more often than average stay deduplicated.
+     */
+    public function testCacheEviction(): void
+    {
+        $uriA = 'http://localhost/a';
+        $uriB = 'http://localhost/b';
+        $uriC = 'http://localhost/c';
+        $middleware = $this
+            ->createMiddleware(2);
+
+        foreach (range(1, 3) as $_) {
+            $middleware
+                ->handleRequest($this->makeRequest($uriA), $this->makeResponse());
+        }
+        $middleware
+            ->handleRequest($this->makeRequest($uriB), $this->makeResponse());
+
+        $processedRequest = $middleware
+            ->handleRequest($this->makeRequest($uriA), $this->makeResponse());
+        self::assertTrue($processedRequest->wasDropped());
+
+        $processedRequest = $middleware
+            ->handleRequest($this->makeRequest($uriB), $this->makeResponse());
+        self::assertTrue($processedRequest->wasDropped());
+
+        // C is new; adding it exceeds the cache size, so some entries have to be evicted.
+        $processedRequest = $middleware
+            ->handleRequest($this->makeRequest($uriC), $this->makeResponse());
+        self::assertFalse($processedRequest->wasDropped());
+
+        // B was evicted from the list of seen requests.
+        $processedRequest = $middleware
+            ->handleRequest($this->makeRequest($uriB), $this->makeResponse());
+        self::assertFalse($processedRequest->wasDropped());
+
+        // A was kept because it was requested more often than the others.
+        $processedRequest = $middleware
+            ->handleRequest($this->makeRequest($uriA), $this->makeResponse());
+        self::assertTrue($processedRequest->wasDropped());
+    }
+
+    /**
+     * Creates the middleware under test with a FakeLogger.
+     *
+     * @param int|null $cacheSize Value for `seen_uris_cache_size`; null leaves the middleware unconfigured.
+     */
+    private function createMiddleware(?int $cacheSize = null): RequestDeduplicationMiddleware
+    {
+        $middleware = new RequestDeduplicationMiddleware(new FakeLogger());
+
+        if (null !== $cacheSize) {
+            $middleware->configure(['seen_uris_cache_size' => $cacheSize]);
+        }
+
+        return $middleware;
+    }
+}