Skip to content

Commit bf85f0b

Browse files
authored
add: random User-Agent header to avoid access restrictions from the online KBBI
1 parent 4c03a99 commit bf85f0b

File tree

1 file changed

+129
-33
lines changed

1 file changed

+129
-33
lines changed

KBBIModel.php

Lines changed: 129 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,73 @@
88

99
class KBBIModel extends Model
1010
{
11-
protected $table = 'kbbi_entries';
11+
/**
 * Pick one browser User-Agent string at random from a fixed, built-in list.
 *
 * The list mixes desktop and mobile signatures for several browsers. The
 * choice is made with array_rand(), so separate calls may return different
 * entries.
 *
 * @return string One of the User-Agent strings defined in the list below.
 */
private function _user_agent()
{
    $pool = [
        // Desktop Chrome: Windows, macOS, Linux
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",

        // Mobile Chrome group: Android and iPhone
        // NOTE(review): the iPhone entry carries a "Version/16.0 ... Safari" token rather than a Chrome one.
        "Mozilla/5.0 (Linux; Android 10; SM-G975F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1",

        // Desktop Firefox: Windows, macOS, Linux
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:114.0) Gecko/20100101 Firefox/114.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:114.0) Gecko/20100101 Firefox/114.0",
        "Mozilla/5.0 (X11; Linux x86_64; rv:114.0) Gecko/20100101 Firefox/114.0",

        // Mobile Firefox: Android and iOS (FxiOS)
        "Mozilla/5.0 (Android 10; Mobile; rv:114.0) Gecko/114.0 Firefox/114.0",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) FxiOS/114.0 Mobile/15E148 Safari/604.1",

        // Desktop Edge: Windows, macOS
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.0.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.0.0",

        // Desktop Safari: macOS
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Safari/605.1.15",

        // Mobile Safari: iPhone, iPad
        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Mobile/15E148 Safari/604.1",
        "Mozilla/5.0 (iPad; CPU OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Mobile/15E148 Safari/604.1",

        // Desktop Opera: Windows, macOS
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 OPR/100.0.0.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 OPR/100.0.0.0",

        // Mobile Opera: Android
        "Mozilla/5.0 (Linux; Android 10; SM-G975F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36 OPR/74.0.0.0",

        // Samsung Internet: Android
        "Mozilla/5.0 (Linux; Android 10; SM-G975F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36 SamsungBrowser/18.0",

        // Internet Explorer 11 (Trident)
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; Trident/7.0; rv:11.0) like Gecko",

        // Mobile UC Browser: Android
        "Mozilla/5.0 (Linux; Android 10; SM-G975F) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/114.0.0.0 Mobile Safari/537.36 UCBrowser/13.4.0.1306",

        // Desktop Brave: Windows
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Brave/114.0.0.0",

        // Later additions: newer Chrome, Firefox and Safari builds
        "Mozilla/5.0 (Linux; Android 13; SM-S911B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Mobile Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
        "Mozilla/5.0 (iPad; CPU OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/605.1.15",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Android 11; Mobile; rv:117.0) Gecko/117.0 Firefox/117.0",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1",
    ];

    // array_rand() returns a random key of $pool; look up the matching entry.
    $pickedKey = array_rand($pool);

    return $pool[$pickedKey];
}
1274

1375
private function _fetchHtml($word)
1476
{
77+
$userAgents = $this->_user_agent();
1578
$encodedWord = rawurlencode($word);
1679
$url = "https://kbbi.kemdikbud.go.id/entri/" . $encodedWord;
1780
$ch = curl_init($url);
@@ -26,6 +89,7 @@ private function _fetchHtml($word)
2689
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
2790
curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');
2891
curl_setopt($ch, CURLOPT_MAXREDIRS, 3);
92+
curl_setopt($ch, CURLOPT_USERAGENT, $userAgents);
2993

3094
$response = curl_exec($ch);
3195

@@ -40,6 +104,7 @@ private function _fetchHtml($word)
40104

41105
private function _request__KBBI_API_Zhirrr($word)
42106
{
107+
$userAgents = $this->_user_agent();
43108
$encodedWord = rawurlencode($word);
44109
$url = "https://kbbi-api-zhirrr.vercel.app/api/kbbi?text=" . $encodedWord;
45110
$ch = curl_init($url);
@@ -54,6 +119,7 @@ private function _request__KBBI_API_Zhirrr($word)
54119
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
55120
curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');
56121
curl_setopt($ch, CURLOPT_MAXREDIRS, 3);
122+
curl_setopt($ch, CURLOPT_USERAGENT, $userAgents);
57123

58124
$response = curl_exec($ch);
59125

@@ -141,33 +207,49 @@ private function _cleanWord($word)
141207

142208
private function _parserV2($htmlData, $word)
143209
{
144-
$doc = Dom\HTMLDocument::createFromString($htmlData, LIBXML_NOERROR);
210+
$doc = new DOMDocument();
211+
libxml_use_internal_errors(true);
212+
$doc->loadHTML($htmlData);
213+
libxml_clear_errors();
214+
215+
$xpath = new DOMXPath($doc);
145216
$dataResponse = [];
146-
147-
$contentDiv = $doc->querySelector("div.container.body-content");
217+
218+
$contentDiv = $xpath->query("//div[contains(@class, 'container body-content')]")->item(0);
148219
if (!$contentDiv) {
149220
return false;
150221
}
151-
222+
152223
// Mengambil semua elemen h2 dalam div body-content
153-
foreach ($contentDiv->querySelectorAll("h2[style*='margin-bottom:3px']") as $h2Element) {
224+
$h2Elements = $xpath->query(".//h2[contains(@style, 'margin-bottom:3px')]", $contentDiv);
225+
foreach ($h2Elements as $i => $h2Element) {
154226
// Mengambil lema dari link a di dalam span rootword
155-
$lemaLink = $h2Element->querySelector("span.rootword > a");
156-
$lema = $lemaLink ? $this->_cleanText($lemaLink->textContent) : '';
157-
227+
$lemaLink = $xpath->query(".//span[contains(@class, 'rootword')]/a", $h2Element)->item(0);
228+
$lema = '';
229+
if ($lemaLink) {
230+
$lema = $this->_cleanText($lemaLink->nodeValue);
231+
}
232+
158233
// Mengambil link Tesaurus
159-
$tesaurusLink = $h2Element->querySelector("p > a[href*='tematis/lema']")?->getAttribute('href') ?? "http://tesaurus.kemdikbud.go.id/tematis/lema/" . $word;
160-
234+
$tesaurusLink = '';
235+
$tesaurusAnchor = $xpath->query(".//p/a[contains(@href, 'tematis/lema')]", $h2Element)->item(0);
236+
if ($tesaurusAnchor) {
237+
$tesaurusLink = $tesaurusAnchor->getAttribute('href');
238+
} else {
239+
$tesaurusLink = "http://tesaurus.kemdikbud.go.id/tematis/lema/".$word;
240+
}
241+
161242
// Mengambil deskripsi/arti dari ul/li setelah h2
162-
$ulElement = $h2Element->nextElementSibling?->classList->contains('adjusted-par') ? $h2Element->nextElementSibling : null;
243+
$ulElement = $xpath->query("following-sibling::ul[@class='adjusted-par'][1]", $h2Element)->item(0);
163244
$arti = [];
164245
if ($ulElement) {
165-
foreach ($ulElement->querySelectorAll("li") as $listItem) {
166-
$deskripsi = $this->_cleanText($listItem->textContent);
246+
$listItems = $xpath->query(".//li", $ulElement);
247+
foreach ($listItems as $j => $listItem) {
248+
$deskripsi = $this->_cleanText($listItem->nodeValue);
167249
$arti[] = ['deskripsi' => $deskripsi];
168250
}
169251
}
170-
252+
171253
// Menyimpan data dalam $dataResponse
172254
if (!empty($lema) && !empty($arti)) {
173255
$dataResponse[] = [
@@ -178,41 +260,56 @@ private function _parserV2($htmlData, $word)
178260
];
179261
}
180262
}
181-
263+
182264
return count($dataResponse) ? $dataResponse : [];
183265
}
184-
266+
185267
private function _parserV3($htmlData, $word)
186268
{
187-
$doc = Dom\HTMLDocument::createFromString($htmlData, LIBXML_NOERROR);
269+
$doc = new DOMDocument();
270+
libxml_use_internal_errors(true);
271+
$doc->loadHTML($htmlData);
272+
libxml_clear_errors();
273+
274+
$xpath = new DOMXPath($doc);
188275
$dataResponse = [];
189-
276+
190277
// Mengambil semua elemen h2 yang memiliki style 'margin-bottom:3px'
191-
foreach ($doc->querySelectorAll("h2[style*='margin-bottom:3px']") as $h2Element) {
278+
$h2Elements = $xpath->query("//h2[contains(@style, 'margin-bottom:3px')]");
279+
foreach ($h2Elements as $h2Element) {
280+
// Mengambil teks dari elemen h2
192281
$lema = $this->_cleanText($h2Element->textContent);
193-
282+
194283
// Mengambil link Tesaurus dari elemen <p><a>
195-
$tesaurusLink = $h2Element->nextElementSibling?->querySelector("a[href*='tematis/lema']")?->getAttribute('href') ?? "http://tesaurus.kemdikbud.go.id/tematis/lema/" . $lema;
196-
284+
$tesaurusLink = '';
285+
$tesaurusAnchor = $xpath->query("following-sibling::p[1]/a[contains(@href, 'tematis/lema')]", $h2Element)->item(0);
286+
if ($tesaurusAnchor) {
287+
$tesaurusLink = $tesaurusAnchor->getAttribute('href');
288+
} else {
289+
$tesaurusLink = "http://tesaurus.kemdikbud.go.id/tematis/lema/" . $lema;
290+
}
291+
197292
// Mengambil deskripsi/arti dari ol/li setelah h2
198293
$arti = [];
199-
$olElement = $h2Element->nextElementSibling?->tagName === 'OL' ? $h2Element->nextElementSibling : null;
294+
$olElement = $xpath->query("following-sibling::ol[1]", $h2Element)->item(0);
200295
if ($olElement) {
201-
foreach ($olElement->querySelectorAll("li") as $listItem) {
202-
$deskripsi = $this->_cleanText($listItem->textContent);
296+
$listItems = $xpath->query(".//li", $olElement);
297+
foreach ($listItems as $listItem) {
298+
$deskripsi = $this->_cleanText($listItem->nodeValue);
203299
$arti[] = ['deskripsi' => $deskripsi];
204300
}
205301
}
206-
302+
207303
// Mengambil deskripsi/arti dari ul/li setelah h2
208-
$ulElement = $h2Element->nextElementSibling?->classList->contains('adjusted-par') ? $h2Element->nextElementSibling : null;
304+
$ulElement = $xpath->query("following-sibling::ul[@class='adjusted-par'][1]", $h2Element)->item(0);
209305
if ($ulElement) {
210-
foreach ($ulElement->querySelectorAll("li") as $listItem) {
211-
$deskripsi = $this->_cleanText($listItem->textContent);
306+
$listItems = $xpath->query(".//li", $ulElement);
307+
foreach ($listItems as $listItem) {
308+
$deskripsi = $this->_cleanText($listItem->nodeValue);
212309
$arti[] = ['deskripsi' => $deskripsi];
213310
}
214311
}
215-
312+
216313
// Menyimpan data dalam $dataResponse
217314
if (!empty($lema) && !empty($arti)) {
218315
$dataResponse[] = [
@@ -223,11 +320,10 @@ private function _parserV3($htmlData, $word)
223320
];
224321
}
225322
}
226-
323+
227324
return count($dataResponse) ? $dataResponse : [];
228325
}
229326

230-
231327
private function _KBBI_official($word)
232328
{
233329
// Clean the word

0 commit comments

Comments
 (0)