@@ -234,33 +234,27 @@ public function analyze($url)
234234 // Reset das regras ativadas para nova análise
235235 $ this ->activatedRules = [];
236236
237- // 1. Clean URL / Limpa a URL
238- $ cleanUrl = $ this ->cleanUrl ($ url );
239- if (!$ cleanUrl ) {
240- $ this ->throwError (self ::ERROR_INVALID_URL );
241- }
242-
243- // 2. Check cache / Verifica cache
244- if ($ this ->cache ->exists ($ cleanUrl )) {
245- return $ this ->cache ->get ($ cleanUrl );
237+ // 1. Check cache / Verifica cache
238+ if ($ this ->cache ->exists ($ url )) {
239+ return $ this ->cache ->get ($ url );
246240 }
247241
248- // 3 . Check blocked domains / Verifica domínios bloqueados
249- $ host = parse_url ($ cleanUrl , PHP_URL_HOST );
242+ // 2 . Check blocked domains / Verifica domínios bloqueados
243+ $ host = parse_url ($ url , PHP_URL_HOST );
250244 if (!$ host ) {
251245 $ this ->throwError (self ::ERROR_INVALID_URL );
252246 }
253247 $ host = preg_replace ('/^www\./ ' , '' , $ host );
254248
255249 if (in_array ($ host , BLOCKED_DOMAINS )) {
256- Logger::getInstance ()->logUrl ($ cleanUrl , 'BLOCKED_DOMAIN ' );
250+ Logger::getInstance ()->logUrl ($ url , 'BLOCKED_DOMAIN ' );
257251 $ this ->throwError (self ::ERROR_BLOCKED_DOMAIN );
258252 }
259253
260- // Check URL status code before proceeding
261- $ redirectInfo = $ this ->checkStatus ($ cleanUrl );
254+ // 3. Check URL status code before proceeding
255+ $ redirectInfo = $ this ->checkStatus ($ url );
262256 if ($ redirectInfo ['httpCode ' ] !== 200 ) {
263- Logger::getInstance ()->logUrl ($ cleanUrl , 'INVALID_STATUS_CODE ' , "HTTP {$ redirectInfo ['httpCode ' ]}" );
257+ Logger::getInstance ()->logUrl ($ url , 'INVALID_STATUS_CODE ' , "HTTP {$ redirectInfo ['httpCode ' ]}" );
264258 if ($ redirectInfo ['httpCode ' ] === 404 ) {
265259 $ this ->throwError (self ::ERROR_NOT_FOUND );
266260 } else {
@@ -279,33 +273,33 @@ public function analyze($url)
279273 $ content = null ;
280274 switch ($ fetchStrategy ) {
281275 case 'fetchContent ' :
282- $ content = $ this ->fetchContent ($ cleanUrl );
276+ $ content = $ this ->fetchContent ($ url );
283277 break ;
284278 case 'fetchFromWaybackMachine ' :
285- $ content = $ this ->fetchFromWaybackMachine ($ cleanUrl );
279+ $ content = $ this ->fetchFromWaybackMachine ($ url );
286280 break ;
287281 case 'fetchFromSelenium ' :
288- $ content = $ this ->fetchFromSelenium ($ cleanUrl , isset ($ domainRules ['browser ' ]) ? $ domainRules ['browser ' ] : 'firefox ' );
282+ $ content = $ this ->fetchFromSelenium ($ url , isset ($ domainRules ['browser ' ]) ? $ domainRules ['browser ' ] : 'firefox ' );
289283 break ;
290284 }
291285
292286 if (!empty ($ content )) {
293287 $ this ->activatedRules [] = "fetchStrategy: $ fetchStrategy " ;
294- $ processedContent = $ this ->processContent ($ content , $ host , $ cleanUrl );
295- $ this ->cache ->set ($ cleanUrl , $ processedContent );
288+ $ processedContent = $ this ->processContent ($ content , $ host , $ url );
289+ $ this ->cache ->set ($ url , $ processedContent );
296290 return $ processedContent ;
297291 }
298292 } catch (Exception $ e ) {
299- Logger::getInstance ()->logUrl ($ cleanUrl , strtoupper ($ fetchStrategy ) . '_ERROR ' , $ e ->getMessage ());
293+ Logger::getInstance ()->logUrl ($ url , strtoupper ($ fetchStrategy ) . '_ERROR ' , $ e ->getMessage ());
300294 throw $ e ;
301295 }
302296 }
303297
304298 // 5. Try all strategies in sequence
305299 $ fetchStrategies = [
306- ['method ' => 'fetchContent ' , 'args ' => [$ cleanUrl ]],
307- ['method ' => 'fetchFromWaybackMachine ' , 'args ' => [$ cleanUrl ]],
308- ['method ' => 'fetchFromSelenium ' , 'args ' => [$ cleanUrl , 'firefox ' ]]
300+ ['method ' => 'fetchContent ' , 'args ' => [$ url ]],
301+ ['method ' => 'fetchFromWaybackMachine ' , 'args ' => [$ url ]],
302+ ['method ' => 'fetchFromSelenium ' , 'args ' => [$ url , 'firefox ' ]]
309303 ];
310304
311305 $ lastError = null ;
@@ -314,8 +308,8 @@ public function analyze($url)
314308 $ content = call_user_func_array ([$ this , $ strategy ['method ' ]], $ strategy ['args ' ]);
315309 if (!empty ($ content )) {
316310 $ this ->activatedRules [] = "fetchStrategy: {$ strategy ['method ' ]}" ;
317- $ processedContent = $ this ->processContent ($ content , $ host , $ cleanUrl );
318- $ this ->cache ->set ($ cleanUrl , $ processedContent );
311+ $ processedContent = $ this ->processContent ($ content , $ host , $ url );
312+ $ this ->cache ->set ($ url , $ processedContent );
319313 return $ processedContent ;
320314 }
321315 } catch (Exception $ e ) {
@@ -326,7 +320,7 @@ public function analyze($url)
326320 }
327321
328322 // If we get here, all strategies failed
329- Logger::getInstance ()->logUrl ($ cleanUrl , 'GENERAL_FETCH_ERROR ' );
323+ Logger::getInstance ()->logUrl ($ url , 'GENERAL_FETCH_ERROR ' );
330324 if ($ lastError ) {
331325 $ message = $ lastError ->getMessage ();
332326 if (strpos ($ message , 'DNS ' ) !== false ) {
@@ -432,8 +426,8 @@ private function fetchContent($url)
432426 */
433427 private function fetchFromWaybackMachine ($ url )
434428 {
435- $ cleanUrl = preg_replace ('#^https?://# ' , '' , $ url );
436- $ availabilityUrl = "https://archive.org/wayback/available?url= " . urlencode ($ cleanUrl );
429+ $ url = preg_replace ('#^https?://# ' , '' , $ url );
430+ $ availabilityUrl = "https://archive.org/wayback/available?url= " . urlencode ($ url );
437431
438432 $ curl = new Curl ();
439433 $ curl ->setOpt (CURLOPT_FOLLOWLOCATION , true );
@@ -552,36 +546,6 @@ private function fetchFromSelenium($url, $browser = 'firefox')
552546 }
553547 }
554548
555- /**
556- * Clean and normalize a URL
557- * Limpa e normaliza uma URL
558- */
559- private function cleanUrl ($ url )
560- {
561- $ url = trim ($ url );
562-
563- if (!filter_var ($ url , FILTER_VALIDATE_URL )) {
564- return false ;
565- }
566-
567- if (preg_match ('#https://([^.]+)\.cdn\.ampproject\.org/v/s/([^/]+)(.*)# ' , $ url , $ matches )) {
568- $ url = 'https:// ' . $ matches [2 ] . $ matches [3 ];
569- }
570-
571- $ parts = parse_url ($ url );
572- if (!isset ($ parts ['scheme ' ]) || !isset ($ parts ['host ' ])) {
573- return false ;
574- }
575-
576- $ cleanedUrl = $ parts ['scheme ' ] . ':// ' . $ parts ['host ' ];
577-
578- if (isset ($ parts ['path ' ])) {
579- $ cleanedUrl .= $ parts ['path ' ];
580- }
581-
582- return $ cleanedUrl ;
583- }
584-
585549 /**
586550 * Get specific rules for a domain
587551 * Obtém regras específicas para um domínio
0 commit comments