Skip to content

Commit a3dda9a

Browse files
committed
ci/cd failures
1 parent 64b3dc8 commit a3dda9a

File tree

7 files changed

+79
-12
lines changed

7 files changed

+79
-12
lines changed

piedomains/piedomain.py

Lines changed: 33 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -61,14 +61,35 @@ def validate_domain_name(domain: str) -> bool:
6161
# Remove trailing slash and path
6262
domain = domain.split('/')[0]
6363

64-
# Basic domain format validation
65-
domain_pattern = re.compile(
66-
r'^[a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?(\.[a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?)*$'
67-
)
64+
# Check for invalid characters (spaces, special chars except hyphen and dot)
65+
if ' ' in domain or any(c in domain for c in '!@#$%^&*()+=[]{}|\\:";\'<>?/'):
66+
return False
67+
68+
# Must contain at least one dot to be a valid domain
69+
if '.' not in domain:
70+
return False
71+
72+
# Check for consecutive dots
73+
if '..' in domain:
74+
return False
6875

69-
# Check length and pattern
70-
if len(domain) > 253 or not domain_pattern.match(domain):
76+
# Cannot start or end with dot or hyphen
77+
if domain.startswith('.') or domain.endswith('.') or domain.startswith('-') or domain.endswith('-'):
7178
return False
79+
80+
# Check length
81+
if len(domain) > 253:
82+
return False
83+
84+
# Validate each part of the domain
85+
parts = domain.split('.')
86+
for part in parts:
87+
if not part or len(part) > 63:
88+
return False
89+
if part.startswith('-') or part.endswith('-'):
90+
return False
91+
if not re.match(r'^[a-zA-Z0-9\-]+$', part):
92+
return False
7293

7394
return True
7495

@@ -126,19 +147,22 @@ def data_cleanup(cls, s: str) -> str:
126147
Returns:
127148
str: Cleaned text with English words only, no stopwords or common terms
128149
"""
150+
if not isinstance(s, str):
151+
raise AttributeError("Input must be a string")
152+
129153
# remove numbers
130154
s = re.sub(r"\d+", "", s)
131155
# remove duplicates
132156
tokens = list(set(s.split()))
133157
# remove punctuation from each token
134158
table = str.maketrans("", "", string.punctuation)
135159
tokens = [w.translate(table) for w in tokens]
136-
# remove non english words
137-
tokens = [w.lower() for w in tokens if w.lower() in words]
138-
# remove non alpha
160+
# remove non alpha first
139161
tokens = [w.lower() for w in tokens if w.isalpha()]
140162
# remove non ascii
141163
tokens = [w.lower() for w in tokens if w.isascii()]
164+
# remove non english words
165+
tokens = [w for w in tokens if w in words]
142166
# filter out stop words
143167
tokens = [w for w in tokens if w not in stop_words]
144168
# filter out short tokens
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
<!DOCTYPE html>
2+
<html>
3+
<head>
4+
<title>Khan Academy - Free Online Courses, Lessons and Practice</title>
5+
</head>
6+
<body>
7+
<h1>Khan Academy</h1>
8+
<p>Learn for free about math, science, computer programming, history, art history, economics, and more.</p>
9+
<div>
10+
<h2>Subjects</h2>
11+
<ul>
12+
<li>Mathematics - algebra, geometry, calculus</li>
13+
<li>Science - biology, chemistry, physics</li>
14+
<li>Computer programming - JavaScript, HTML, CSS</li>
15+
<li>History and social studies</li>
16+
<li>Economics and finance</li>
17+
</ul>
18+
</div>
19+
<div>
20+
<h2>Learning Platform</h2>
21+
<p>Interactive exercises, instructional videos, and personalized learning dashboard.</p>
22+
</div>
23+
</body>
24+
</html>
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
<!DOCTYPE html>
2+
<html>
3+
<head>
4+
<title>Yahoo! News - Latest News Headlines</title>
5+
</head>
6+
<body>
7+
<h1>Yahoo News</h1>
8+
<p>Latest breaking news headlines from around the world. Get the latest news stories and updates.</p>
9+
<div>
10+
<h2>Top Stories</h2>
11+
<article>Breaking news story about politics and government affairs.</article>
12+
<article>Sports news and updates from major leagues.</article>
13+
<article>Technology news covering latest innovations.</article>
14+
</div>
15+
</body>
16+
</html>
Lines changed: 1 addition & 0 deletions
Loading
Lines changed: 1 addition & 0 deletions
Loading

piedomains/tests/test_005_text_processing.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -114,11 +114,11 @@ def test_data_cleanup_removes_duplicates(self):
114114
def test_data_cleanup_filters_non_english(self):
115115
"""Test that data cleanup attempts to filter non-English words."""
116116
# This test may be limited by the availability of the NLTK words corpus
117-
text_mixed = "english test français deutsche invalid"
117+
text_mixed = "computer test français deutsche invalid"
118118
result = Piedomain.data_cleanup(text_mixed)
119119

120-
# Should contain recognizable English words
121-
self.assertIn("english", result)
120+
# Should contain recognizable English words that aren't stopwords
121+
self.assertIn("computer", result)
122122
self.assertIn("test", result)
123123

124124
def test_data_cleanup_empty_input(self):

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ dependencies = [
4141
"pandas==1.4.2",
4242
"nltk==3.9",
4343
"tensorflow>=2.11.1",
44+
"numpy>=1.21.0,<2.0.0",
4445
"scikit-learn==1.5.0",
4546
"joblib==1.2.0",
4647
"selenium==4.8.0",

0 commit comments

Comments
 (0)