88
99class KBBIModel extends Model
1010{
11- protected $ table = 'kbbi_entries ' ;
/**
 * Pick one browser User-Agent string at random from a fixed pool.
 *
 * The pool is written as one sub-list per browser family for readability
 * and flattened into a single list before the draw; the draw itself is
 * done with array_rand().
 *
 * @return string One User-Agent string taken from the pool.
 */
private function _user_agent()
{
    $families = [
        // Chrome (Desktop)
        [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 ",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 ",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 ",
        ],
        // Chrome (Mobile)
        [
            "Mozilla/5.0 (Linux; Android 10; SM-G975F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36 ",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1 ",
        ],
        // Firefox (Desktop)
        [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:114.0) Gecko/20100101 Firefox/114.0 ",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:114.0) Gecko/20100101 Firefox/114.0 ",
            "Mozilla/5.0 (X11; Linux x86_64; rv:114.0) Gecko/20100101 Firefox/114.0 ",
        ],
        // Firefox (Mobile)
        [
            "Mozilla/5.0 (Android 10; Mobile; rv:114.0) Gecko/114.0 Firefox/114.0 ",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) FxiOS/114.0 Mobile/15E148 Safari/604.1 ",
        ],
        // Edge (Desktop)
        [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.0.0 ",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.0.0 ",
        ],
        // Safari (Desktop)
        [
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Safari/605.1.15 ",
        ],
        // Safari (Mobile)
        [
            "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Mobile/15E148 Safari/604.1 ",
            "Mozilla/5.0 (iPad; CPU OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Mobile/15E148 Safari/604.1 ",
        ],
        // Opera (Desktop)
        [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 OPR/100.0.0.0 ",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 OPR/100.0.0.0 ",
        ],
        // Opera (Mobile)
        [
            "Mozilla/5.0 (Linux; Android 10; SM-G975F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36 OPR/74.0.0.0 ",
        ],
        // Samsung Internet
        [
            "Mozilla/5.0 (Linux; Android 10; SM-G975F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36 SamsungBrowser/18.0 ",
        ],
        // Internet Explorer
        [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; Trident/7.0; rv:11.0) like Gecko ",
        ],
        // UC Browser (Mobile)
        [
            "Mozilla/5.0 (Linux; Android 10; SM-G975F) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/114.0.0.0 Mobile Safari/537.36 UCBrowser/13.4.0.1306 ",
        ],
        // Brave (Desktop)
        [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Brave/114.0.0.0 ",
        ],
        // Later additions (newer browser and OS versions)
        [
            "Mozilla/5.0 (Linux; Android 13; SM-S911B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Mobile Safari/537.36 ",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 ",
            "Mozilla/5.0 (iPad; CPU OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/605.1.15 ",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 ",
            "Mozilla/5.0 (Android 11; Mobile; rv:117.0) Gecko/117.0 Firefox/117.0 ",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1 ",
        ],
    ];

    // Flatten the per-family lists into one sequentially indexed pool,
    // preserving the order in which the strings are listed above.
    $pool = array_merge(...$families);

    // array_rand() returns a random key of $pool; use it to fetch the value.
    return $pool[array_rand($pool)];
}
1274
1375 private function _fetchHtml ($ word )
1476 {
77+ $ userAgents = $ this ->_user_agent ();
1578 $ encodedWord = rawurlencode ($ word );
1679 $ url = "https://kbbi.kemdikbud.go.id/entri/ " . $ encodedWord ;
1780 $ ch = curl_init ($ url );
@@ -26,6 +89,7 @@ private function _fetchHtml($word)
2689 curl_setopt ($ ch , CURLOPT_CONNECTTIMEOUT , 10 );
2790 curl_setopt ($ ch , CURLOPT_ENCODING , 'gzip,deflate ' );
2891 curl_setopt ($ ch , CURLOPT_MAXREDIRS , 3 );
92+ curl_setopt ($ ch , CURLOPT_USERAGENT , $ userAgents );
2993
3094 $ response = curl_exec ($ ch );
3195
@@ -40,6 +104,7 @@ private function _fetchHtml($word)
40104
41105 private function _request__KBBI_API_Zhirrr ($ word )
42106 {
107+ $ userAgents = $ this ->_user_agent ();
43108 $ encodedWord = rawurlencode ($ word );
44109 $ url = "https://kbbi-api-zhirrr.vercel.app/api/kbbi?text= " . $ encodedWord ;
45110 $ ch = curl_init ($ url );
@@ -54,6 +119,7 @@ private function _request__KBBI_API_Zhirrr($word)
54119 curl_setopt ($ ch , CURLOPT_CONNECTTIMEOUT , 10 );
55120 curl_setopt ($ ch , CURLOPT_ENCODING , 'gzip,deflate ' );
56121 curl_setopt ($ ch , CURLOPT_MAXREDIRS , 3 );
122+ curl_setopt ($ ch , CURLOPT_USERAGENT , $ userAgents );
57123
58124 $ response = curl_exec ($ ch );
59125
@@ -141,33 +207,49 @@ private function _cleanWord($word)
141207
142208 private function _parserV2 ($ htmlData , $ word )
143209 {
144- $ doc = Dom \HTMLDocument::createFromString ($ htmlData , LIBXML_NOERROR );
210+ $ doc = new DOMDocument ();
211+ libxml_use_internal_errors (true );
212+ $ doc ->loadHTML ($ htmlData );
213+ libxml_clear_errors ();
214+
215+ $ xpath = new DOMXPath ($ doc );
145216 $ dataResponse = [];
146-
147- $ contentDiv = $ doc -> querySelector ( " div. container. body-content " );
217+
218+ $ contentDiv = $ xpath -> query ( " // div[contains(@class, ' container body-content')] " )-> item ( 0 );
148219 if (!$ contentDiv ) {
149220 return false ;
150221 }
151-
222+
152223 // Mengambil semua elemen h2 dalam div body-content
153- foreach ($ contentDiv ->querySelectorAll ("h2[style*='margin-bottom:3px'] " ) as $ h2Element ) {
224+ $ h2Elements = $ xpath ->query (".//h2[contains(@style, 'margin-bottom:3px')] " , $ contentDiv );
225+ foreach ($ h2Elements as $ i => $ h2Element ) {
154226 // Mengambil lema dari link a di dalam span rootword
155- $ lemaLink = $ h2Element ->querySelector ("span.rootword > a " );
156- $ lema = $ lemaLink ? $ this ->_cleanText ($ lemaLink ->textContent ) : '' ;
157-
227+ $ lemaLink = $ xpath ->query (".//span[contains(@class, 'rootword')]/a " , $ h2Element )->item (0 );
228+ $ lema = '' ;
229+ if ($ lemaLink ) {
230+ $ lema = $ this ->_cleanText ($ lemaLink ->nodeValue );
231+ }
232+
158233 // Mengambil link Tesaurus
159- $ tesaurusLink = $ h2Element ->querySelector ("p > a[href*='tematis/lema'] " )?->getAttribute('href ' ) ?? "http://tesaurus.kemdikbud.go.id/tematis/lema/ " . $ word ;
160-
234+ $ tesaurusLink = '' ;
235+ $ tesaurusAnchor = $ xpath ->query (".//p/a[contains(@href, 'tematis/lema')] " , $ h2Element )->item (0 );
236+ if ($ tesaurusAnchor ) {
237+ $ tesaurusLink = $ tesaurusAnchor ->getAttribute ('href ' );
238+ } else {
239+ $ tesaurusLink = "http://tesaurus.kemdikbud.go.id/tematis/lema/ " .$ word ;
240+ }
241+
161242 // Mengambil deskripsi/arti dari ul/li setelah h2
162- $ ulElement = $ h2Element -> nextElementSibling ?->classList-> contains ( 'adjusted-par ' ) ? $ h2Element-> nextElementSibling : null ;
243+ $ ulElement = $ xpath -> query ( " following-sibling::ul[@class= 'adjusted-par'][1] " , $ h2Element)-> item ( 0 ) ;
163244 $ arti = [];
164245 if ($ ulElement ) {
165- foreach ($ ulElement ->querySelectorAll ("li " ) as $ listItem ) {
166- $ deskripsi = $ this ->_cleanText ($ listItem ->textContent );
246+ $ listItems = $ xpath ->query (".//li " , $ ulElement );
247+ foreach ($ listItems as $ j => $ listItem ) {
248+ $ deskripsi = $ this ->_cleanText ($ listItem ->nodeValue );
167249 $ arti [] = ['deskripsi ' => $ deskripsi ];
168250 }
169251 }
170-
252+
171253 // Menyimpan data dalam $dataResponse
172254 if (!empty ($ lema ) && !empty ($ arti )) {
173255 $ dataResponse [] = [
@@ -178,41 +260,56 @@ private function _parserV2($htmlData, $word)
178260 ];
179261 }
180262 }
181-
263+
182264 return count ($ dataResponse ) ? $ dataResponse : [];
183265 }
184-
266+
185267 private function _parserV3 ($ htmlData , $ word )
186268 {
187- $ doc = Dom \HTMLDocument::createFromString ($ htmlData , LIBXML_NOERROR );
269+ $ doc = new DOMDocument ();
270+ libxml_use_internal_errors (true );
271+ $ doc ->loadHTML ($ htmlData );
272+ libxml_clear_errors ();
273+
274+ $ xpath = new DOMXPath ($ doc );
188275 $ dataResponse = [];
189-
276+
190277 // Mengambil semua elemen h2 yang memiliki style 'margin-bottom:3px'
191- foreach ($ doc ->querySelectorAll ("h2[style*='margin-bottom:3px'] " ) as $ h2Element ) {
278+ $ h2Elements = $ xpath ->query ("//h2[contains(@style, 'margin-bottom:3px')] " );
279+ foreach ($ h2Elements as $ h2Element ) {
280+ // Mengambil teks dari elemen h2
192281 $ lema = $ this ->_cleanText ($ h2Element ->textContent );
193-
282+
194283 // Mengambil link Tesaurus dari elemen <p><a>
195- $ tesaurusLink = $ h2Element ->nextElementSibling ?->querySelector("a[href*='tematis/lema'] " )?->getAttribute('href ' ) ?? "http://tesaurus.kemdikbud.go.id/tematis/lema/ " . $ lema ;
196-
284+ $ tesaurusLink = '' ;
285+ $ tesaurusAnchor = $ xpath ->query ("following-sibling::p[1]/a[contains(@href, 'tematis/lema')] " , $ h2Element )->item (0 );
286+ if ($ tesaurusAnchor ) {
287+ $ tesaurusLink = $ tesaurusAnchor ->getAttribute ('href ' );
288+ } else {
289+ $ tesaurusLink = "http://tesaurus.kemdikbud.go.id/tematis/lema/ " . $ lema ;
290+ }
291+
197292 // Mengambil deskripsi/arti dari ol/li setelah h2
198293 $ arti = [];
199- $ olElement = $ h2Element -> nextElementSibling ?->tagName === ' OL ' ? $ h2Element-> nextElementSibling : null ;
294+ $ olElement = $ xpath -> query ( " following-sibling::ol[1] " , $ h2Element)-> item ( 0 ) ;
200295 if ($ olElement ) {
201- foreach ($ olElement ->querySelectorAll ("li " ) as $ listItem ) {
202- $ deskripsi = $ this ->_cleanText ($ listItem ->textContent );
296+ $ listItems = $ xpath ->query (".//li " , $ olElement );
297+ foreach ($ listItems as $ listItem ) {
298+ $ deskripsi = $ this ->_cleanText ($ listItem ->nodeValue );
203299 $ arti [] = ['deskripsi ' => $ deskripsi ];
204300 }
205301 }
206-
302+
207303 // Mengambil deskripsi/arti dari ul/li setelah h2
208- $ ulElement = $ h2Element -> nextElementSibling ?->classList-> contains ( 'adjusted-par ' ) ? $ h2Element-> nextElementSibling : null ;
304+ $ ulElement = $ xpath -> query ( " following-sibling::ul[@class= 'adjusted-par'][1] " , $ h2Element)-> item ( 0 ) ;
209305 if ($ ulElement ) {
210- foreach ($ ulElement ->querySelectorAll ("li " ) as $ listItem ) {
211- $ deskripsi = $ this ->_cleanText ($ listItem ->textContent );
306+ $ listItems = $ xpath ->query (".//li " , $ ulElement );
307+ foreach ($ listItems as $ listItem ) {
308+ $ deskripsi = $ this ->_cleanText ($ listItem ->nodeValue );
212309 $ arti [] = ['deskripsi ' => $ deskripsi ];
213310 }
214311 }
215-
312+
216313 // Menyimpan data dalam $dataResponse
217314 if (!empty ($ lema ) && !empty ($ arti )) {
218315 $ dataResponse [] = [
@@ -223,11 +320,10 @@ private function _parserV3($htmlData, $word)
223320 ];
224321 }
225322 }
226-
323+
227324 return count ($ dataResponse ) ? $ dataResponse : [];
228325 }
229326
230-
231327 private function _KBBI_official ($ word )
232328 {
233329 // Clean the word
0 commit comments