import re
from bs4 import BeautifulSoup
from typing import List, Dict, Optional

from Core.core import BasePuzzleCrawler, PuzzleItem


class BattleshipCrawler(BasePuzzleCrawler):
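    """Battleship puzzle crawler.

    parse_index collects puzzle links from the '#index-1' container of an
    index page; parse_puzzle_detail extracts the grid sections from a detail
    page and packs them into a PuzzleItem.
    """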
    def parse_index(self, html_content: str) -> List[Dict]:
        soup = BeautifulSoup(html_content, 'html.parser')
        container = soup.find('div', id='index-1')

        if not container:
            self.logger.warning("Index container #index-1 not found.")
            return []

        results = []
        for link in container.find_all('a'):
            href = link.get('href')
            text = link.get_text(strip=True)

            if href and text:
                # Links carrying the 'sv' CSS class use a different section
                # layout on their detail page (see parse_puzzle_detail).
                link_type = 'class_sv' if 'sv' in link.get('class', []) else 'other'
                results.append({
                    'href': href if href.startswith('http') else self.config.base_url + href,
                    'text': text,
                    'type': link_type
                })

        # Extra filtering (e.g. filter_and_classify_results) could hook in here.
        return results

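    # Detail pages carry the puzzle as plain text split into bracketed
    # sections ([rlabels], [clabels], [problem], [solution]); the regexes
    # below cut those sections out, tolerating the layout variants seen
    # across puzzle types.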
    def parse_puzzle_detail(self, html_content: str, metadata: Dict) -> Optional[PuzzleItem]:
        text = metadata.get('text', 'unknown')
        link_type = metadata.get('type')

        # 'class_sv' pages close the [solution] block with [moves]; all
        # others close it with [end].
        sol_end = r"\[moves\]" if link_type == "class_sv" else r"\[end\]"

        # Candidate section layouts, tried in order until all four fields match:
        #   1. [rlabels] before [clabels], with a [problem] (areas) block;
        #   2. [clabels] before [rlabels], with a [problem] block;
        #   3. [clabels] before [rlabels], no [problem] block -- 'areas' is
        #      deliberately pointed at the solution text so the check further
        #      down can detect this case and synthesize an empty area grid.
        pattern_sets = [
            {
                'rows': r"(?<=\[rlabels\]\n)(.*?)(?=\[clabels\])",
                'cols': r"(?<=\[clabels\]\n)(.*?)(?=\[problem\])",
                'areas': r"(?<=\[problem\]\n)(.*?)(?=\[solution\])",
                'sol': rf"(?<=\[solution\]\n)(.*?)(?={sol_end})",
            },
            {
                'cols': r"(?<=\[clabels\]\n)(.*?)(?=\[rlabels\])",
                'rows': r"(?<=\[rlabels\]\n)(.*?)(?=\[problem\])",
                'areas': r"(?<=\[problem\]\n)(.*?)(?=\[solution\])",
                'sol': rf"(?<=\[solution\]\n)(.*?)(?={sol_end})",
            },
            {
                'cols': r"(?<=\[clabels\]\n)(.*?)(?=\[rlabels\])",
                'rows': r"(?<=\[rlabels\]\n)(.*?)(?=\[solution\])",
                'areas': rf"(?<=\[solution\]\n)(.*?)(?={sol_end})",
                'sol': rf"(?<=\[solution\]\n)(.*?)(?={sol_end})",
            },
        ]

        try:
            for patterns in pattern_sets:
                cols_match = re.search(patterns['cols'], html_content, re.DOTALL)
                rows_match = re.search(patterns['rows'], html_content, re.DOTALL)
                areas_match = re.search(patterns['areas'], html_content, re.DOTALL)
                sol_match = re.search(patterns['sol'], html_content, re.DOTALL)
                if all([cols_match, rows_match, areas_match, sol_match]):
                    break
            else:
                self.logger.error(f"Error parsing detail for {text}: no section layout matched")
                return None
            # Process the matched sections.
            solution_raw = sol_match.group().strip()
            cols_raw = cols_match.group().strip()
            rows_raw = rows_match.group().strip()
            areas_raw = areas_match.group().strip()

            rows_list = solution_raw.split("\n")

            num_rows = len(rows_list)
            num_cols = len(rows_list[0].split()) if num_rows > 0 else 0

            # Third layout above: there was no [problem] block, so replace the
            # duplicated solution text with an all-'-' area grid.
            if areas_raw == solution_raw:
                areas_raw = "\n".join([" ".join(["-" for _ in range(num_cols)]) for _ in range(num_rows)])

            # Count ships by length: cnt_list[k] holds the number of ships of
            # length k + 1. 'o' is a single-cell ship; 'n'/'s' are the top and
            # bottom caps of a vertical ship, 'w'/'e' the left and right caps
            # of a horizontal one; '-' and 'x' cells are skipped as water.
            cnt_list = [0, 0, 0, 0, 0]
            sol_mat = [row.strip().split(" ") for row in rows_list]
            visited = set()
            for i in range(num_rows):
                for j in range(num_cols):
                    if sol_mat[i][j] in "-x" or (i, j) in visited:
                        continue
                    elif sol_mat[i][j] == "o":
                        cnt_list[0] += 1
                        visited.add((i, j))
                    elif sol_mat[i][j] == "n":
                        # Walk down to the closing 's'; k - i is then length - 1.
                        k = i
                        while k < num_rows and sol_mat[k][j] != "s":
                            visited.add((k, j))
                            k += 1
                        cnt_list[k - i] += 1
                    elif sol_mat[i][j] == "w":
                        # Walk right to the closing 'e'; k - j is then length - 1.
                        k = j
                        while k < num_cols and sol_mat[i][k] != "e":
                            visited.add((i, k))
                            k += 1
                        cnt_list[k - j] += 1

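            # Worked example: for the 3x2 solution grid
            #     n -
            #     s -
            #     - o
            # the loop records one length-2 ship ('n'..'s') and one submarine
            # ('o'), giving cnt_list == [1, 1, 0, 0, 0] and the header
            # "3 2 1 1 0 0 0" below.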
            header = f"{num_rows} {num_cols} {cnt_list[0]} {cnt_list[1]} {cnt_list[2]} {cnt_list[3]} {cnt_list[4]}"
            problem_str = f"{header}\n{cols_raw}\n{rows_raw}\n{areas_raw}"
            solution_str = f"{header}\n{solution_raw}"

            puzzle_id = f"{text}_{num_rows}x{num_cols}"

            return PuzzleItem(
                id=puzzle_id,
                difficulty=0,  # Placeholder
                source_url=metadata.get('href', ''),
                problem=problem_str,
                solution=solution_str,
                metadata=metadata
            )

        except Exception as e:
            self.logger.error(f"Error parsing detail for {text}: {e}")
            return None
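
# A minimal usage sketch, kept commented out: BasePuzzleCrawler lives in
# Core.core and is not shown here, so the constructor call and the `fetch`
# helper below are assumptions, not the project's actual API.
#
# if __name__ == "__main__":
#     crawler = BattleshipCrawler()              # assumed: no-arg constructor
#     index_html = crawler.fetch(crawler.config.base_url)  # assumed helper
#     for entry in crawler.parse_index(index_html):
#         detail_html = crawler.fetch(entry['href'])
#         item = crawler.parse_puzzle_detail(detail_html, entry)
#         if item:
#             print(item.id)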