Police-Data-Accessibility-Project · maxachis · Mar 25, 2025 · Mar 25, 2025 · Mar 25, 2025
@@ -0,0 +1,62 @@
+"""Add name, description, and url optional data source metadata
+
+Revision ID: 6eb8084e2f48
+Revises: 69f7cc4f56d4
+Create Date: 2025-03-15 17:45:46.619721
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from util.alembic_helpers import switch_enum_type
+
+# revision identifiers, used by Alembic.
+revision: str = '6eb8084e2f48'
+down_revision: Union[str, None] = '69f7cc4f56d4'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    # Add name and description columns to URL table
+    op.add_column('urls', sa.Column('name', sa.String(), nullable=True))
+    op.add_column('urls', sa.Column('description', sa.String(), nullable=True))
+
+    # Create URL_optional_data_source_metadata
+    op.create_table(
+        'url_optional_data_source_metadata',
+        sa.Column('id', sa.Integer(), nullable=False),
+        sa.Column('url_id', sa.Integer(), nullable=False),
+        sa.Column('record_formats', sa.ARRAY(sa.String()), nullable=True),
+        sa.Column('data_portal_type', sa.String(), nullable=True),
+        sa.Column('supplying_entity', sa.String(), nullable=True),
+        sa.ForeignKeyConstraint(['url_id'], ['urls.id'], ),
+        sa.PrimaryKeyConstraint('id')
+    )
+
+    # Add 'Misc Metadata' to TaskType enum
+    switch_enum_type(
+        table_name='tasks',
+        column_name='task_type',
+        enum_name='task_type',
+        new_enum_values=['HTML', 'Relevancy', 'Record Type', 'Agency Identification', 'Misc Metadata']
+    )
+
+
+def downgrade() -> None:
+    # Remove name and description columns from URL table
+    op.drop_column('urls', 'name')
+    op.drop_column('urls', 'description')
+
+    # Drop URL_optional_data_source_metadata
+    op.drop_table('url_optional_data_source_metadata')
+
+    # Remove 'Misc Metadata' from TaskType enum
+    switch_enum_type(
+        table_name='tasks',
+        column_name='task_type',
+        enum_name='task_type',
+        new_enum_values=['HTML', 'Relevancy', 'Record Type', 'Agency Identification']
+    )
@@ -3,12 +3,15 @@
 
 from collector_db.helper_functions import get_postgres_connection_string
 
-if __name__ == "__main__":
+def apply_migrations():
+    """Upgrade the database to the Alembic ``head`` revision.
+
+    Builds an Alembic ``Config`` from "alembic.ini", overrides its
+    ``sqlalchemy.url`` option with ``get_postgres_connection_string()``,
+    and runs ``command.upgrade``. Prints a message before and after.
+    """
     print("Applying migrations...")
     alembic_config = Config("alembic.ini")
     alembic_config.set_main_option(
         "sqlalchemy.url",
         get_postgres_connection_string()
     )
     command.upgrade(alembic_config, "head")
-    print("Migrations applied.")
+    print("Migrations applied.")
+
+# Run the migrations only when this file is executed as a script, so that
+# importing apply_migrations from elsewhere has no side effects.
+if __name__ == "__main__":
+    apply_migrations()
@@ -2,7 +2,7 @@
 from typing import Optional, Type
 
 from fastapi import HTTPException
-from sqlalchemy import select, exists, func, case, desc, Select, not_, and_
+from sqlalchemy import select, exists, func, case, desc, Select, not_, and_, or_, update
 from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker
 from sqlalchemy.orm import selectinload, joinedload, QueryableAttribute
 from sqlalchemy.sql.functions import coalesce
@@ -22,7 +22,7 @@
 from collector_db.models import URL, URLErrorInfo, URLHTMLContent, Base, \
     RootURL, Task, TaskError, LinkTaskURL, Batch, Agency, AutomatedUrlAgencySuggestion, \
     UserUrlAgencySuggestion, AutoRelevantSuggestion, AutoRecordTypeSuggestion, UserRelevantSuggestion, \
-    UserRecordTypeSuggestion, ApprovingUserURL
+    UserRecordTypeSuggestion, ApprovingUserURL, URLOptionalDataSourceMetadata
 from collector_manager.enums import URLStatus, CollectorType
 from core.DTOs.GetNextRecordTypeAnnotationResponseInfo import GetNextRecordTypeAnnotationResponseInfo
 from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseInfo
@@ -34,6 +34,7 @@
     GetURLsResponseInnerInfo
 from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo
 from core.DTOs.task_data_objects.AgencyIdentificationTDO import AgencyIdentificationTDO
+from core.DTOs.task_data_objects.URLMiscellaneousMetadataTDO import URLMiscellaneousMetadataTDO
 from core.enums import BatchStatus, SuggestionType, RecordType
 from html_tag_collector.DataClassTags import convert_to_response_html_info
 
@@ -353,6 +354,68 @@
         scalar_result = await session.scalars(statement)
         return bool(scalar_result.first())
 
+    @session_manager
+    async def has_pending_urls_missing_miscellaneous_metadata(self, session: AsyncSession) -> bool:
+        query = StatementComposer.pending_urls_missing_miscellaneous_metadata_query()
+        query = query.limit(1)
+
+        scalar_result = await session.scalars(query)
+        return bool(scalar_result.first())
+
+    @session_manager
+    async def get_pending_urls_missing_miscellaneous_metadata(
+            self,
+            session: AsyncSession
+    ) -> list[URLMiscellaneousMetadataTDO]:
+        query = StatementComposer.pending_urls_missing_miscellaneous_metadata_query()
+        query = (
+            query.options(
+                selectinload(URL.batch),
+            ).limit(100).order_by(URL.id)
+        )
+
+        scalar_result = await session.scalars(query)
+        all_results = scalar_result.all()
+        final_results = []
+        for result in all_results:
+            tdo = URLMiscellaneousMetadataTDO(
+                url_id=result.id,
+                collector_metadata=result.collector_metadata,
+                collector_type=CollectorType(result.batch.strategy),
+            )
+            final_results.append(tdo)
+        return final_results
+
+    @session_manager
+    async def add_miscellaneous_metadata(self, session: AsyncSession, tdos: list[URLMiscellaneousMetadataTDO]):
+        updates = []
+
+        for tdo in tdos:
+            update_query = update(
+                URL
+            ).where(
+                URL.id == tdo.url_id
+            ).values(
+                name=tdo.name,
+                description=tdo.description,
+            )
+
+            updates.append(update_query)
+
+        for stmt in updates:
+            await session.execute(stmt)
+
+        for tdo in tdos:
+            metadata_object = URLOptionalDataSourceMetadata(
+                url_id=tdo.url_id,
+                record_formats=tdo.record_formats,
+                data_portal_type=tdo.data_portal_type,
+                supplying_entity=tdo.supplying_entity
+            )
+            session.add(metadata_object)
+
+
+
     @session_manager
     async def get_pending_urls_without_html_data(self, session: AsyncSession):
         # TODO: Add test that includes some urls WITH html data. Check they're not returned
@@ -433,97 +496,15 @@
         )
 
 
-    #TODO: Slated for deletion
-    @session_manager
-    async def get_urls_with_html_data_and_without_metadata_type(
-            self,
-            session: AsyncSession,
-            without_metadata_type: URLMetadataAttributeType = URLMetadataAttributeType.RELEVANT
-    ) -> list[URLWithHTML]:
-
-        # Get URLs with no relevancy metadata
-        statement = (select(URL)
-                     .options(selectinload(URL.html_content))
-                     .where(URL.outcome == URLStatus.PENDING.value))
-        # Exclude URLs with auto suggested record types
-        statement = self.statement_composer.exclude_urls_with_extant_model(
-            statement=statement,
-            model=AutoRecordTypeSuggestion
-        )
-        statement = statement.limit(100).order_by(URL.id)
-
-
-        # TODO: The below can probably be generalized
-
-
-        statement = self.statement_composer.exclude_urls_with_select_metadata(
-            statement=statement,
-            attribute=without_metadata_type
-        )
-        # TODO: Generalize
-        statement = statement.limit(100).order_by(URL.id)
-        raw_result = await session.execute(statement)
-        result = raw_result.all()
-        url_ids_to_urls = {url_id: url for url_id, url, _ in result}
-        url_ids_to_html_info = {url_id: [] for url_id, _, _ in result}
-
-        for url_id, _, html_info in result:
-            url_ids_to_html_info[url_id].append(
-                URLHTMLContentInfo(**html_info.__dict__)
-            )
-
-        final_results = []
-        for url_id, url in url_ids_to_urls.items():
-            url_with_html = URLWithHTML(
-                url_id=url_id,
-                url=url,
-                html_infos=url_ids_to_html_info[url_id]
-            )
-            final_results.append(url_with_html)
-
-        return final_results
-
-    @session_manager
-    async def has_pending_urls_with_html_data_and_without_metadata_type(
-            self,
-            session: AsyncSession,
-            without_metadata_type: URLMetadataAttributeType = URLMetadataAttributeType.RELEVANT
-    ) -> bool:
-        # TODO: Generalize this so that it can exclude based on other attributes
-        # Get URLs with no relevancy metadata
-        statement = (select(URL.id, URL.url, URLHTMLContent).
-                     join(URLHTMLContent).
-                     where(URL.outcome == URLStatus.PENDING.value))
-        statement = self.statement_composer.exclude_urls_with_select_metadata(
-            statement=statement,
-            attribute=without_metadata_type
-        )
-        statement = statement.limit(1)
-        raw_result = await session.execute(statement)
-        result = raw_result.all()
-        return len(result) > 0
-
-
-
-    # @session_manager
-    # async def get_annotations_for_metadata_id(
-    #         self,
-    #         session: AsyncSession,
-    #         metadata_id: int
-    # ) -> list[MetadataAnnotation]:
-    #     statement = (select(MetadataAnnotation).
-    #                  where(MetadataAnnotation.metadata_id == metadata_id))
-    #     scalar_result = await session.scalars(statement)
-    #     all_results = scalar_result.all()
-    #     return [MetadataAnnotationInfo(**result.__dict__) for result in all_results]
-
     @session_manager
-    async def get_all(self, session, model: Base):
+    async def get_all(self, session, model: Base, order_by_attribute: Optional[str] = None) -> list[Base]:
         """
         Get all records of a model
         Used primarily in testing
+
+        :param model: the model to select from
+        :param order_by_attribute: optional name of an attribute on the model
+            to order the results by; no ordering is applied when it is
+            None or empty
+        :return: all records of the model
         """
         statement = select(model)
+        # The attribute is looked up by name on the model; getattr raises
+        # AttributeError if the model has no such attribute
+        if order_by_attribute:
+            statement = statement.order_by(getattr(model, order_by_attribute))
         result = await session.execute(statement)
         return result.scalars().all()
 

@@ -1,11 +1,11 @@
 from typing import Any
 
-from sqlalchemy import Select, select, exists, Table, func, Subquery
+from sqlalchemy import Select, select, exists, Table, func, Subquery, and_
 from sqlalchemy.orm import aliased
 
 from collector_db.enums import URLMetadataAttributeType, ValidationStatus
-from collector_db.models import URL, URLHTMLContent, AutomatedUrlAgencySuggestion
-from collector_manager.enums import URLStatus
+from collector_db.models import URL, URLHTMLContent, AutomatedUrlAgencySuggestion, URLOptionalDataSourceMetadata, Batch
+from collector_manager.enums import URLStatus, CollectorType
 
 
 class StatementComposer:
@@ -36,35 +36,7 @@
                         )
                     ))
 
-    @staticmethod
-    def exclude_urls_with_select_metadata(
-            statement: Select,
-            attribute: URLMetadataAttributeType
-    ) -> Select:
-        return (statement.where(
-                        ~exists(
-                            select(URLMetadata.id).
-                            where(
-                                URLMetadata.url_id == URL.id,
-                                URLMetadata.attribute == attribute.value
-                            )
-                        )
-                    ))
 
-    @staticmethod
-    def exclude_url_annotated_by_user(
-            statement: Select,
-            user_id: int
-    ) -> Select:
-        return (statement.where(
-                        ~exists(
-                            select(MetadataAnnotation.id).
-                            where(
-                                MetadataAnnotation.metadata_id == URLMetadata.id,
-                                MetadataAnnotation.user_id == user_id
-                            )
-                        )
-                    ))
 
 
     @staticmethod
@@ -88,19 +60,29 @@
 
         return statement
 
+
     @staticmethod
-    async def get_all_html_content_for_url(subquery) -> Select:
-        statement = (
-            select(
-                subquery.c.url,
-                subquery.c.metadata_id,
-                subquery.c.value,
-                URLHTMLContent.content_type,
-                URLHTMLContent.content,
+    def pending_urls_missing_miscellaneous_metadata_query() -> Select:
+        query = select(URL).where(
+            and_(
+                    URL.outcome == URLStatus.PENDING.value,
+                    URL.name == None,
+                    URL.description == None,
+                    URLOptionalDataSourceMetadata.url_id == None,
+                    Batch.strategy.in_(
+                        [
+                            CollectorType.AUTO_GOOGLER.value,
+                            CollectorType.CKAN.value,
+                            CollectorType.MUCKROCK_ALL_SEARCH.value,
+                            CollectorType.MUCKROCK_COUNTY_SEARCH.value,
+                            CollectorType.MUCKROCK_SIMPLE_SEARCH.value
+                        ]
+                    )
+                )
+            ).outerjoin(
+                URLOptionalDataSourceMetadata
+            ).join(
+                Batch
             )
-            .join(URLHTMLContent)
-            .where(subquery.c.url_id == URLHTMLContent.url_id)
-        )
 
-        raw_result = await session.execute(statement)
-        result = raw_result.all()
+        return query
@@ -37,6 +37,7 @@ class TaskType(PyEnum):
     RELEVANCY = "Relevancy"
     RECORD_TYPE = "Record Type"
     AGENCY_IDENTIFICATION = "Agency Identification"
+    MISC_METADATA = "Misc Metadata"
 
 class PGEnum(TypeDecorator):
     impl = postgresql.ENUM