ci/cd failures

soodoku · soodoku · commit a3dda9a36cc5 · 2025-08-30T17:49:57.000+02:00
diff --git a/piedomains/piedomain.py b/piedomains/piedomain.py
@@ -61,14 +61,35 @@ def validate_domain_name(domain: str) -> bool:
         # Remove trailing slash and path
         domain = domain.split('/')[0]
         
-        # Basic domain format validation
-        domain_pattern = re.compile(
-            r'^[a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?(\.[a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?)*$'
-        )
+        # Check for invalid characters (spaces, special chars except hyphen and dot)
+        if ' ' in domain or any(c in domain for c in '!@#$%^&*()+=[]{}|\\:";\'<>?/'):
+            return False
+        
+        # Must contain at least one dot to be a valid domain
+        if '.' not in domain:
+            return False
+        
+        # Check for consecutive dots
+        if '..' in domain:
+            return False
         
-        # Check length and pattern
-        if len(domain) > 253 or not domain_pattern.match(domain):
+        # Cannot start or end with dot or hyphen
+        if domain.startswith('.') or domain.endswith('.') or domain.startswith('-') or domain.endswith('-'):
             return False
+        
+        # Check length
+        if len(domain) > 253:
+            return False
+        
+        # Validate each part of the domain
+        parts = domain.split('.')
+        for part in parts:
+            if not part or len(part) > 63:
+                return False
+            if part.startswith('-') or part.endswith('-'):
+                return False
+            if not re.match(r'^[a-zA-Z0-9\-]+$', part):
+                return False
             
         return True
 
@@ -126,19 +147,22 @@ def data_cleanup(cls, s: str) -> str:
         Returns:
             str: Cleaned text with English words only, no stopwords or common terms
         """
+        if not isinstance(s, str):
+            raise AttributeError("Input must be a string")
+        
         # remove numbers
         s = re.sub(r"\d+", "", s)
         # remove duplicates
         tokens = list(set(s.split()))
         # remove punctuation from each token
         table = str.maketrans("", "", string.punctuation)
         tokens = [w.translate(table) for w in tokens]
-        # remove non english words
-        tokens = [w.lower() for w in tokens if w.lower() in words]
-        # remove non alpha
+        # remove non alpha first
         tokens = [w.lower() for w in tokens if w.isalpha()]
         # remove non ascii
         tokens = [w.lower() for w in tokens if w.isascii()]
+        # remove non english words
+        tokens = [w for w in tokens if w in words]
         # filter out stop words
         tokens = [w for w in tokens if w not in stop_words]
         # filter out short tokens
diff --git a/piedomains/tests/html/khanacademy.org.html b/piedomains/tests/html/khanacademy.org.html
@@ -0,0 +1,24 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Khan Academy - Free Online Courses, Lessons and Practice</title>
+</head>
+<body>
+    <h1>Khan Academy</h1>
+    <p>Learn for free about math, science, computer programming, history, art history, economics, and more.</p>
+    <div>
+        <h2>Subjects</h2>
+        <ul>
+            <li>Mathematics - algebra, geometry, calculus</li>
+            <li>Science - biology, chemistry, physics</li>
+            <li>Computer programming - JavaScript, HTML, CSS</li>
+            <li>History and social studies</li>
+            <li>Economics and finance</li>
+        </ul>
+    </div>
+    <div>
+        <h2>Learning Platform</h2>
+        <p>Interactive exercises, instructional videos, and personalized learning dashboard.</p>
+    </div>
+</body>
+</html>
diff --git a/piedomains/tests/html/yahoo.com.html b/piedomains/tests/html/yahoo.com.html
@@ -0,0 +1,16 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Yahoo! News - Latest News Headlines</title>
+</head>
+<body>
+    <h1>Yahoo News</h1>
+    <p>Latest breaking news headlines from around the world. Get the latest news stories and updates.</p>
+    <div>
+        <h2>Top Stories</h2>
+        <article>Breaking news story about politics and government affairs.</article>
+        <article>Sports news and updates from major leagues.</article>
+        <article>Technology news covering latest innovations.</article>
+    </div>
+</body>
+</html>
diff --git a/piedomains/tests/images/khanacademy.org.png b/piedomains/tests/images/khanacademy.org.png
@@ -0,0 +1 @@
+placeholder
diff --git a/piedomains/tests/images/yahoo.com.png b/piedomains/tests/images/yahoo.com.png
@@ -0,0 +1 @@
+placeholder
diff --git a/piedomains/tests/test_005_text_processing.py b/piedomains/tests/test_005_text_processing.py
@@ -114,11 +114,11 @@ def test_data_cleanup_removes_duplicates(self):
     def test_data_cleanup_filters_non_english(self):
         """Test that data cleanup attempts to filter non-English words."""
         # This test may be limited by the NLTK words corpus availability
-        text_mixed = "english test français deutsche invalid"
+        text_mixed = "computer test français deutsche invalid"
         result = Piedomain.data_cleanup(text_mixed)
         
-        # Should contain recognizable English words
-        self.assertIn("english", result)
+        # Should contain recognizable English words that aren't stopwords
+        self.assertIn("computer", result)
         self.assertIn("test", result)
 
     def test_data_cleanup_empty_input(self):
diff --git a/pyproject.toml b/pyproject.toml
@@ -41,6 +41,7 @@ dependencies = [
     "pandas==1.4.2",
     "nltk==3.9",
     "tensorflow>=2.11.1",
+    "numpy>=1.21.0,<2.0.0",
     "scikit-learn==1.5.0",
     "joblib==1.2.0",
     "selenium==4.8.0",