1- import re , requests
1+ import logging
2+ import re
3+ import requests
24from wiktionaryparser .utils import WordData , Definition , RelatedWord
35from bs4 import BeautifulSoup
46from itertools import zip_longest
2022 "coordinate terms" ,
2123]
2224
23- def is_subheading (child , parent ) :
25+ def is_subheading (child : str , parent : str ) -> bool :
2426 child_headings = child .split ("." )
2527 parent_headings = parent .split ("." )
2628 if len (child_headings ) <= len (parent_headings ):
@@ -30,60 +32,60 @@ def is_subheading(child, parent):
3032 return False
3133 return True
3234
33- class WiktionaryParser ( object ) :
34- def __init__ (self ):
35+ class WiktionaryParser :
def __init__(self) -> None:
    """Set up the page URL template, the HTTP session and the default filters."""
    # `{}` is filled with the requested word when the URL is formatted.
    self.url = "https://en.wiktionary.org/wiki/{}?printable=yes"
    self.soup = None
    self.current_word = None
    self.language: str = 'english'

    # One adapter per scheme, each created with max_retries=2.
    self.session = requests.Session()
    for scheme in ("http://", "https://"):
        self.session.mount(scheme, requests.adapters.HTTPAdapter(max_retries=2))

    # Copies of the module-level lists, so the include_*/exclude_* methods
    # change this instance's lists rather than the module-level ones
    # (assumes `copy` is `copy.copy` -- TODO confirm against the imports).
    self.PARTS_OF_SPEECH: list[str] = copy(PARTS_OF_SPEECH)
    self.RELATIONS: list[str] = copy(RELATIONS)
    self.INCLUDED_ITEMS: list[str] = (
        self.RELATIONS + self.PARTS_OF_SPEECH + ['etymology', 'pronunciation']
    )
4547
def include_part_of_speech(self, part_of_speech: str) -> None:
    """Add a part of speech to the set this parser recognises.

    The name is lower-cased first. If it is already listed in
    ``PARTS_OF_SPEECH`` nothing changes; otherwise it is appended to both
    ``PARTS_OF_SPEECH`` and ``INCLUDED_ITEMS``.
    """
    part_of_speech = part_of_speech.lower()
    if part_of_speech in self.PARTS_OF_SPEECH:
        return
    self.PARTS_OF_SPEECH.append(part_of_speech)
    self.INCLUDED_ITEMS.append(part_of_speech)
5153
def exclude_part_of_speech(self, part_of_speech: str) -> None:
    """Remove a part of speech from the set this parser recognises.

    The name is lower-cased, then removed from ``PARTS_OF_SPEECH`` and
    from ``INCLUDED_ITEMS``.

    Raises:
        ValueError: if the lower-cased name is missing from either list
            (raised by ``list.remove``).
    """
    part_of_speech = part_of_speech.lower()
    self.PARTS_OF_SPEECH.remove(part_of_speech)
    self.INCLUDED_ITEMS.remove(part_of_speech)
5658
def include_relation(self, relation: str) -> None:
    """Add a relation type to the set this parser recognises.

    The name is lower-cased first. A name already present in ``RELATIONS``
    is ignored; a new one is appended to ``RELATIONS`` and
    ``INCLUDED_ITEMS``.
    """
    normalized = relation.lower()
    if normalized in self.RELATIONS:
        return
    self.RELATIONS.append(normalized)
    self.INCLUDED_ITEMS.append(normalized)
6264
def exclude_relation(self, relation: str) -> None:
    """Remove a relation type from the set this parser recognises.

    The name is lower-cased, then removed from ``RELATIONS`` and from
    ``INCLUDED_ITEMS``.

    Raises:
        ValueError: if the lower-cased name is missing from either list
            (raised by ``list.remove``).
    """
    relation = relation.lower()
    self.RELATIONS.remove(relation)
    self.INCLUDED_ITEMS.remove(relation)
6769
68- def set_default_language (self , language = None ):
70+ def set_default_language (self , language = None ) -> None :
6971 if language is not None :
7072 self .language = language .lower ()
7173
def get_default_language(self) -> str:
    """Return the language currently stored in ``self.language``."""
    return self.language
7476
def clean_html(self) -> None:
    """Detach unwanted elements from ``self.soup`` in place.

    Every tag returned by ``find_all`` for the class filter below is
    extracted from the tree.
    """
    class_filter = {'class': ['sister-wikipedia', 'thumb', 'reference', 'cited-source']}
    unwanted_tags = self.soup.find_all(True, class_filter)
    for unwanted in unwanted_tags:
        unwanted.extract()
7981
def remove_digits(self, string: str) -> str:
    """Return ``string`` with every character in ``digits`` deleted, then stripped.

    NOTE(review): ``digits`` is presumably ``string.digits`` (ASCII 0-9),
    imported at module level -- confirm against the file's imports.
    """
    drop_digits = str.maketrans('', '', digits)
    without_digits = string.translate(drop_digits)
    return without_digits.strip()
8284
def count_digits(self, string: str) -> int:
    """Return how many characters of ``string`` satisfy ``str.isdigit``."""
    # Summing the booleans counts the matches without building an
    # intermediate list just to take its length.
    return sum(map(str.isdigit, string))
8587
86- def get_id_list (self , contents , content_type ) :
88+ def get_id_list (self , contents : list , content_type : str ) -> list [ tuple [ str , str , str ]] :
8789 if content_type == 'etymologies' :
8890 checklist = ['etymology' ]
8991 elif content_type == 'pronunciation' :
@@ -96,7 +98,7 @@ def get_id_list(self, contents, content_type):
9698 checklist = self .RELATIONS
9799 else :
98100 return None
99- id_list = []
101+ id_list : list [ tuple [ str , str , str ]] = []
100102 if len (contents ) == 0 :
101103 return [('1' , x .title (), x ) for x in checklist if self .soup .find ('span' , {'id' : x .title ()})]
102104 for content_tag in contents :
@@ -107,7 +109,7 @@ def get_id_list(self, contents, content_type):
107109 id_list .append ((content_index , content_id , text_to_check ))
108110 return id_list
109111
110- def get_word_data (self , language ) :
112+ def get_word_data (self , language : str ) -> list :
111113 contents = self .soup .find_all ('span' , {'class' : 'toctext' })
112114 word_contents = []
113115 start_index = None
@@ -139,7 +141,7 @@ def get_word_data(self, language):
139141 json_obj_list = self .map_to_object (word_data )
140142 return json_obj_list
141143
142- def parse_pronunciations (self , word_contents ):
144+ def parse_pronunciations (self , word_contents ) -> list :
143145 pronunciation_id_list = self .get_id_list (word_contents , 'pronunciation' )
144146 pronunciation_list = []
145147 audio_links = []
@@ -168,7 +170,7 @@ def parse_pronunciations(self, word_contents):
168170 pronunciation_list .append ((pronunciation_index , pronunciation_text , audio_links ))
169171 return pronunciation_list
170172
171- def parse_definitions (self , word_contents ):
173+ def parse_definitions (self , word_contents ) -> list :
172174 definition_id_list = self .get_id_list (word_contents , 'definitions' )
173175 definition_list = []
174176 definition_tag = None
@@ -191,7 +193,7 @@ def parse_definitions(self, word_contents):
191193 definition_list .append ((def_index , definition_text , def_type ))
192194 return definition_list
193195
194- def parse_examples (self , word_contents ):
196+ def parse_examples (self , word_contents ) -> list :
195197 definition_id_list = self .get_id_list (word_contents , 'definitions' )
196198 example_list = []
197199 for def_index , def_id , def_type in definition_id_list :
@@ -212,7 +214,7 @@ def parse_examples(self, word_contents):
212214 table = table .find_next_sibling ()
213215 return example_list
214216
215- def parse_etymologies (self , word_contents ):
217+ def parse_etymologies (self , word_contents ) -> list :
216218 etymology_id_list = self .get_id_list (word_contents , 'etymologies' )
217219 etymology_list = []
218220 etymology_tag = None
@@ -231,7 +233,7 @@ def parse_etymologies(self, word_contents):
231233 etymology_list .append ((etymology_index , etymology_text ))
232234 return etymology_list
233235
234- def parse_related_words (self , word_contents ):
236+ def parse_related_words (self , word_contents ) -> list :
235237 relation_id_list = self .get_id_list (word_contents , 'related' )
236238 related_words_list = []
237239 for related_index , related_id , relation_type in relation_id_list :
@@ -246,7 +248,7 @@ def parse_related_words(self, word_contents):
246248 related_words_list .append ((related_index , words , relation_type ))
247249 return related_words_list
248250
249- def map_to_object (self , word_data ) :
251+ def map_to_object (self , word_data : dict ) -> list :
250252 json_obj_list = []
251253 if not word_data ['etymologies' ]:
252254 word_data ['etymologies' ] = [('' , '' )]
@@ -276,7 +278,7 @@ def map_to_object(self, word_data):
276278 json_obj_list .append (data_obj .to_json ())
277279 return json_obj_list
278280
279- def fetch (self , word , language = None , old_id = None ):
281+ def fetch (self , word : str , language : str | None = None , old_id : int | None = None ) -> list :
280282 language = self .language if not language else language
281283 response = self .session .get (self .url .format (word ), params = {'oldid' : old_id })
282284 self .soup = BeautifulSoup (response .text .replace ('>\n <' , '><' ), 'html.parser' )
0 commit comments