Skip to content

Commit 2b842a7

Browse files
committed
adiciona suporte a requisição via selenium gurizada
1 parent a6fc451 commit 2b842a7

File tree

10 files changed

+118
-25
lines changed

10 files changed

+118
-25
lines changed

TEST_URLS.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,6 @@ piaui.folha.uol.com.br
55
jota.info
66
haaretz.com
77
haaretz.co.il
8-
gauchazh.clicrbs.com.br
98
economist.com
109
liberation.fr
1110
lesoir.be
@@ -66,6 +65,7 @@ https://www.agazeta.com.br/concursos-e-empregos/concursos/prefeitura-de-cariacic
6665
https://natelinha.uol.com.br/televisao/2024/12/05/boninho-fecha-com-o-sbt-para-novo-reality-show-219855.php
6766
https://gamarevista.uol.com.br/semana/deu-vontade-de-ter-outra-vida/novas-formas-abandonar/
6867
https://tecnoblog.net/noticias/cor-do-ano-esta-em-celulares-da-motorola-que-serao-vendidos-no-brasil/
68+
https://gauchazh.clicrbs.com.br/pioneiro/policia/noticia/2024/11/caxias-do-sul-podera-fazer-emprestimo-de-ate-us-40-milhoes-para-melhorias-tecnologicas-em-educacao-seguranca-e-servicos-municipais-cm3q9yn870051014fzz77djqz.html
6969

7070
## Internacional
7171
https://www.nytimes.com/2024/11/20/us/politics/matt-gaetz-venmo-payments-sex.html

app/.env.sample

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -27,4 +27,7 @@ S3_BUCKET=
2727
S3_REGION=us-east-1
2828
S3_FOLDER=cache/
2929
S3_ACL=private
30-
S3_ENDPOINT=
30+
S3_ENDPOINT=
31+
32+
# Configurações do Selenium
33+
SELENIUM_HOST=localhost:4444

app/composer.json

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,8 @@
22
"require": {
33
"vlucas/phpdotenv": "^5.6.1",
44
"aws/aws-sdk-php": "^3.0",
5-
"php-curl-class/php-curl-class": "^11.0"
5+
"php-curl-class/php-curl-class": "^11.0",
6+
"php-webdriver/webdriver": "^1.15"
67
},
78
"autoload": {
89
"psr-4": {

app/config.php

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -25,10 +25,10 @@
2525
define('SITE_NAME', isset($_ENV['SITE_NAME']) ? $_ENV['SITE_NAME'] : 'Marreta');
2626
define('SITE_DESCRIPTION', isset($_ENV['SITE_DESCRIPTION']) ? $_ENV['SITE_DESCRIPTION'] : 'Chapéu de paywall é marreta!');
2727
define('SITE_URL', isset($_ENV['SITE_URL']) ? $_ENV['SITE_URL'] : 'https://' . $_SERVER['HTTP_HOST']);
28-
define('MAX_ATTEMPTS', 3); // Número máximo de tentativas para acessar uma URL
2928
define('DNS_SERVERS', isset($_ENV['DNS_SERVERS']) ? $_ENV['DNS_SERVERS'] : '1.1.1.1, 8.8.8.8');
3029
define('CACHE_DIR', __DIR__ . '/cache');
3130
define('DISABLE_CACHE', isset($_ENV['DISABLE_CACHE']) ? filter_var($_ENV['DISABLE_CACHE'], FILTER_VALIDATE_BOOLEAN) : false);
31+
define('SELENIUM_HOST', isset($_ENV['SELENIUM_HOST']) ? $_ENV['SELENIUM_HOST'] : 'localhost:4444');
3232

3333
/**
3434
* Configurações de Cache S3

app/data/blocked_domains.php

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,6 @@
2323
'ole.com.ar',
2424
//-- Bloqueio tecnico de acesso ao conteudo
2525
'bloomberg.com',
26-
'gauchazh.clicrbs.com.br',
2726
'opopular.com.br',
2827
'npr.org',
2928
'sportskeeda.com',

app/data/domain_rules.php

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@
2222
* 'scriptTagRemove' => ['gtm.js', 'ga.js'], // Exclui scripts específicos das regras globais
2323
* 'classElementRemove' => ['subscription'] // Exclui classes específicas das regras globais
2424
* ]
25+
* - useSelenium: Boolean indicando se deve usar Selenium para extração
2526
*/
2627
return [
2728
'nsctotal.com.br' => [
@@ -38,6 +39,14 @@
3839
],
3940
'classAttrRemove' => ['wall', 'protected-content', 'cropped-block']
4041
],
42+
'gauchazh.clicrbs.com.br' => [
43+
'classAttrRemove' => [' m-paid-content', 'paid-content-apply'],
44+
'scriptTagRemove' => ['vendors-', 'verdors-'],
45+
'excludeGlobalRules' => [
46+
'classElementRemove' => ['paid-content']
47+
],
48+
'useSelenium' => true
49+
],
4150
'foreignaffairs.com' => [
4251
'customCode' => 'document.addEventListener(\'DOMContentLoaded\', function() {
4352
const dropcapDiv = document.querySelector(\'.article-dropcap\');

app/data/global_rules.php

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -60,6 +60,12 @@
6060
'lgpd',
6161
'push',
6262
'sw.js',
63-
'stats.js'
63+
'stats.js',
64+
'piano.io',
65+
'onesignal.com',
66+
'getsitecontrol.com',
67+
'navdmp.com',
68+
'getblue.io',
69+
'smartocto.com'
6470
]
6571
];

app/inc/Rules.php

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,9 @@ class Rules
3434
'cookies',
3535
'classAttrRemove',
3636
'customCode',
37-
'excludeGlobalRules'
37+
'excludeGlobalRules',
38+
'customStyle',
39+
'useSelenium'
3840
];
3941

4042

app/inc/URLAnalyzer.php

Lines changed: 86 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -10,15 +10,21 @@
1010
* - Requisições HTTP com múltiplas tentativas
1111
* - Processamento de conteúdo baseado em regras específicas por domínio
1212
* - Suporte a Wayback Machine como fallback
13+
* - Suporte a extração via Selenium quando habilitado por domínio
1314
*/
1415

1516
require_once 'Rules.php';
1617
require_once 'Cache.php';
1718

1819
use Curl\Curl;
20+
use Facebook\WebDriver\Remote\DesiredCapabilities;
21+
use Facebook\WebDriver\Remote\RemoteWebDriver;
22+
use Facebook\WebDriver\Firefox\FirefoxOptions;
23+
use Facebook\WebDriver\Firefox\FirefoxProfile;
1924

2025
class URLAnalyzer
2126
{
27+
// Rest of the file content remains exactly the same
2228
/**
2329
* @var array Lista de User Agents disponíveis para requisições
2430
*/
@@ -125,31 +131,93 @@ public function analyze($url)
125131
throw new Exception($error);
126132
}
127133

128-
// 4. Tenta buscar conteúdo diretamente
129-
try {
130-
$content = $this->fetchContent($cleanUrl);
131-
if (!empty($content)) {
132-
$processedContent = $this->processContent($content, $host, $cleanUrl);
133-
$this->cache->set($cleanUrl, $processedContent);
134-
return $processedContent;
134+
// 4. Verifica se deve usar Selenium
135+
$domainRules = $this->getDomainRules($host);
136+
if (isset($domainRules['useSelenium']) && $domainRules['useSelenium'] === true) {
137+
try {
138+
$content = $this->fetchFromSelenium($cleanUrl);
139+
if (!empty($content)) {
140+
$processedContent = $this->processContent($content, $host, $cleanUrl);
141+
$this->cache->set($cleanUrl, $processedContent);
142+
return $processedContent;
143+
}
144+
} catch (Exception $e) {
145+
$this->logError($cleanUrl, "Selenium fetch error: " . $e->getMessage());
135146
}
136-
} catch (Exception $e) {
137-
$this->logError($cleanUrl, "Direct fetch error: " . $e->getMessage());
147+
} else {
148+
// 5. Tenta buscar conteúdo diretamente
149+
try {
150+
$content = $this->fetchContent($cleanUrl);
151+
if (!empty($content)) {
152+
$processedContent = $this->processContent($content, $host, $cleanUrl);
153+
$this->cache->set($cleanUrl, $processedContent);
154+
return $processedContent;
155+
}
156+
} catch (Exception $e) {
157+
$this->logError($cleanUrl, "Direct fetch error: " . $e->getMessage());
158+
}
159+
160+
// 6. Tenta buscar do Wayback Machine como fallback
161+
try {
162+
$content = $this->fetchFromWaybackMachine($cleanUrl);
163+
if (!empty($content)) {
164+
$processedContent = $this->processContent($content, $host, $cleanUrl);
165+
$this->cache->set($cleanUrl, $processedContent);
166+
return $processedContent;
167+
}
168+
} catch (Exception $e) {
169+
$this->logError($cleanUrl, "Wayback Machine error: " . $e->getMessage());
170+
}
171+
172+
throw new Exception("Não foi possível obter o conteúdo da URL");
138173
}
139174

140-
// 5. Tenta buscar do Wayback Machine como fallback
175+
176+
}
177+
178+
/**
179+
* Tenta obter o conteúdo da URL usando Selenium
180+
*
181+
* @param string $url URL para buscar
182+
* @return string|null Conteúdo HTML da página
183+
* @throws Exception Em caso de erro na requisição
184+
*/
185+
private function fetchFromSelenium($url)
186+
{
187+
$host = 'http://'.SELENIUM_HOST.'/wd/hub';
188+
189+
$profile = new FirefoxProfile();
190+
$profile->setPreference("permissions.default.image", 2);
191+
$profile->setPreference("javascript.enabled", true);
192+
193+
$options = new FirefoxOptions();
194+
$options->setProfile($profile);
195+
196+
$capabilities = DesiredCapabilities::firefox();
197+
$capabilities->setCapability(FirefoxOptions::CAPABILITY, $options);
198+
141199
try {
142-
$content = $this->fetchFromWaybackMachine($cleanUrl);
143-
if (!empty($content)) {
144-
$processedContent = $this->processContent($content, $host, $cleanUrl);
145-
$this->cache->set($cleanUrl, $processedContent);
146-
return $processedContent;
200+
$driver = RemoteWebDriver::create($host, $capabilities);
201+
$driver->manage()->timeouts()->pageLoadTimeout(10);
202+
$driver->manage()->timeouts()->setScriptTimeout(5);
203+
204+
$driver->get($url);
205+
206+
$htmlSource = $driver->executeScript("return document.documentElement.outerHTML;");
207+
208+
$driver->quit();
209+
210+
if (empty($htmlSource)) {
211+
throw new Exception("Selenium returned empty content");
147212
}
213+
214+
return $htmlSource;
148215
} catch (Exception $e) {
149-
$this->logError($cleanUrl, "Wayback Machine error: " . $e->getMessage());
216+
if (isset($driver)) {
217+
$driver->quit();
218+
}
219+
throw $e;
150220
}
151-
152-
throw new Exception("Não foi possível obter o conteúdo da URL");
153221
}
154222

155223
/**

docker-entrypoint.sh

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -85,6 +85,11 @@ if [ -n "${S3_ENDPOINT}" ]; then
8585
echo "S3_ENDPOINT=${S3_ENDPOINT}" >> /app/.env
8686
fi
8787

88+
# Configurações do Selenium
89+
if [ -n "${SELENIUM_HOST}" ]; then
90+
echo "SELENIUM_HOST=${SELENIUM_HOST}" >> /app/.env
91+
fi
92+
8893
log_success "Variáveis de ambiente configuradas"
8994

9095
# === Ajuste de Permissões ===

0 commit comments

Comments
 (0)