@@ -37,10 +37,16 @@ private function _cleanText($text)
3737 return preg_replace ('/\s+/ ' , ' ' , trim ($ text ));
3838 }
3939
40- public function searchWord ($ word )
/**
 * Normalise a search term before it is handed to the parsers.
 *
 * Every character that is not an ASCII letter, digit or whitespace is
 * removed; the result is trimmed, lower-cased, and each run of whitespace
 * is squeezed into a single space.
 *
 * @param string $word Raw search term.
 *
 * @return string The normalised term.
 */
private function _cleanWord($word)
{
    // Keep only ASCII letters, digits and whitespace.
    $stripped = preg_replace('/[^a-zA-Z0-9\s]/', '', $word);

    // Trim the ends and lower-case before collapsing inner whitespace.
    $lowered = strtolower(trim($stripped));

    return preg_replace('/\s+/', ' ', $lowered);
}
4347
48+ private function _parserV1 ($ htmlData , $ word )
49+ {
4450 $ doc = new DOMDocument ();
4551 libxml_use_internal_errors (true );
4652 $ doc ->loadHTML ($ htmlData );
@@ -79,11 +85,91 @@ public function searchWord($word)
7985 }
8086
8187 $ dataResponse [$ i ] = [
88+ 'word ' => $ word ,
8289 'lema ' => $ lema ,
8390 'arti ' => $ arti ,
8491 'tesaurusLink ' => $ tesaurusLink ,
8592 ];
8693 }
94+ }
95+
/**
 * Parse the result page layout built from <h2> headings followed by
 * <ul class="adjusted-par"> lists inside the "container body-content" div.
 *
 * @param string $htmlData Raw HTML of the result page.
 * @param string $word     Search term; stored in every entry, appended to
 *                         the lemma label, and used to build the fallback
 *                         thesaurus URL.
 *
 * @return array List of entries, each with the keys 'word', 'lema', 'arti'
 *               (a list of ['deskripsi' => string]) and 'tesaurusLink'.
 *               Empty when the page holds no usable entry.
 */
private function _parserV2($htmlData, $word)
{
    // DOMDocument::loadHTML() rejects an empty source (ValueError on PHP 8),
    // so there is nothing to parse in that case.
    if (!is_string($htmlData) || $htmlData === '') {
        return [];
    }

    $doc = new DOMDocument();
    // Real-world HTML is rarely well-formed; collect libxml errors quietly
    // and discard them once the document is loaded.
    libxml_use_internal_errors(true);
    $doc->loadHTML($htmlData);
    libxml_clear_errors();

    $xpath = new DOMXPath($doc);
    $dataResponse = [];

    $contentDiv = $xpath->query("//div[contains(@class, 'container body-content')]")->item(0);
    if (!$contentDiv) {
        // Return an empty list instead of false so the result is always
        // countable; count(false) raises a TypeError on PHP 8.
        return [];
    }

    // Collect every entry heading (h2) inside the body-content div.
    $h2Elements = $xpath->query(".//h2[contains(@style, 'margin-bottom:3px')]", $contentDiv);
    foreach ($h2Elements as $h2Element) {
        // The lemma is the text of the link inside the "rootword" span.
        $lemaLink = $xpath->query(".//span[contains(@class, 'rootword')]/a", $h2Element)->item(0);
        $lema = '';
        if ($lemaLink) {
            $lema = $this->_cleanText($lemaLink->nodeValue);
        }

        // Thesaurus link: use the anchor when present, otherwise build the
        // URL from the search term.
        $tesaurusAnchor = $xpath->query(".//p/a[contains(@href, 'tematis/lema')]", $h2Element)->item(0);
        if ($tesaurusAnchor) {
            $tesaurusLink = $tesaurusAnchor->getAttribute('href');
        } else {
            $tesaurusLink = "http://tesaurus.kemdikbud.go.id/tematis/lema/" . $word;
        }

        // Definitions live in the list items of the first matching <ul>
        // that follows the heading.
        $ulElement = $xpath->query("following-sibling::ul[@class='adjusted-par'][1]", $h2Element)->item(0);
        $arti = [];
        if ($ulElement) {
            foreach ($xpath->query(".//li", $ulElement) as $listItem) {
                $arti[] = ['deskripsi' => $this->_cleanText($listItem->nodeValue)];
            }
        }

        // Only keep entries that have both a lemma and at least one definition.
        if (!empty($lema) && !empty($arti)) {
            $dataResponse[] = [
                'word' => $word,
                'lema' => $lema . " » " . $word,
                'arti' => $arti,
                'tesaurusLink' => $tesaurusLink,
            ];
        }
    }

    return $dataResponse;
}
154+
/**
 * Look up a word and return its parsed dictionary entries.
 *
 * The page is fetched with the word as given; the cleaned form of the word
 * is what the parsers receive. The parsers are tried in order and the first
 * one that yields entries wins.
 *
 * @param string $word Search term as supplied by the caller.
 *
 * @return array|false Non-empty list of entries, or false when no parser
 *                     produced any.
 */
public function searchWord($word)
{
    $cleanWord = $this->_cleanWord($word);
    $htmlData = $this->_fetchHtml($word);

    foreach (['_parserV1', '_parserV2'] as $parser) {
        $entries = $this->$parser($htmlData, $cleanWord);

        // A parser may hand back a non-array (e.g. false or null) when it
        // finds nothing; count() on such a value raises a TypeError on
        // PHP 8, so check the type before counting.
        if (is_array($entries) && count($entries) > 0) {
            return $entries;
        }
    }

    return false;
}
0 commit comments