Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
"""Add name, description, and url optional data source metadata

Revision ID: 6eb8084e2f48
Revises: 69f7cc4f56d4
Create Date: 2025-03-15 17:45:46.619721

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa

from util.alembic_helpers import switch_enum_type

# revision identifiers, used by Alembic.
revision: str = '6eb8084e2f48'
down_revision: Union[str, None] = '69f7cc4f56d4'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Apply revision 6eb8084e2f48.

    Performs three schema changes, in this order:

    1. adds nullable ``name`` and ``description`` string columns to ``urls``;
    2. creates the ``url_optional_data_source_metadata`` table, whose
       ``url_id`` column is a foreign key to ``urls.id``;
    3. passes the ``task_type`` enum values, now including
       ``'Misc Metadata'``, to ``switch_enum_type`` for ``tasks.task_type``.
    """
    # Optional free-text fields on the existing URL table.
    for column_name in ('name', 'description'):
        op.add_column('urls', sa.Column(column_name, sa.String(), nullable=True))

    # Side table holding the optional data source metadata for a URL.
    op.create_table(
        'url_optional_data_source_metadata',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('url_id', sa.Integer(), nullable=False),
        sa.Column('record_formats', sa.ARRAY(sa.String()), nullable=True),
        sa.Column('data_portal_type', sa.String(), nullable=True),
        sa.Column('supplying_entity', sa.String(), nullable=True),
        sa.ForeignKeyConstraint(['url_id'], ['urls.id'], ),
        sa.PrimaryKeyConstraint('id')
    )

    # Full value list for the task_type enum after this revision.
    task_type_values = [
        'HTML',
        'Relevancy',
        'Record Type',
        'Agency Identification',
        'Misc Metadata',
    ]
    switch_enum_type(
        table_name='tasks',
        column_name='task_type',
        enum_name='task_type',
        new_enum_values=task_type_values
    )


def downgrade() -> None:
    """Revert revision 6eb8084e2f48.

    Drops the ``name`` and ``description`` columns from ``urls``, drops the
    ``url_optional_data_source_metadata`` table, and passes the ``task_type``
    enum values without ``'Misc Metadata'`` to ``switch_enum_type``.
    """
    # Remove the optional free-text fields from the URL table.
    for column_name in ('name', 'description'):
        op.drop_column('urls', column_name)

    # Remove the optional data source metadata side table.
    op.drop_table('url_optional_data_source_metadata')

    # NOTE(review): switch_enum_type is defined elsewhere. If it casts the
    # existing column values to the new enum, rows in ``tasks`` that still
    # hold 'Misc Metadata' would presumably make this step fail - confirm
    # before downgrading a populated database.
    task_type_values = [
        'HTML',
        'Relevancy',
        'Record Type',
        'Agency Identification',
    ]
    switch_enum_type(
        table_name='tasks',
        column_name='task_type',
        enum_name='task_type',
        new_enum_values=task_type_values
    )
7 changes: 5 additions & 2 deletions apply_migrations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,15 @@

from collector_db.helper_functions import get_postgres_connection_string

def apply_migrations() -> None:
    """Upgrade the database to the newest Alembic revision.

    Loads ``alembic.ini``, overrides its ``sqlalchemy.url`` option with the
    value returned by ``get_postgres_connection_string()``, and runs the
    Alembic ``upgrade`` command to ``head``. A progress line is printed
    before and after the upgrade.
    """
    print("Applying migrations...")
    alembic_config = Config("alembic.ini")
    # Use the connection string computed at runtime instead of the one
    # stored in alembic.ini.
    alembic_config.set_main_option(
        "sqlalchemy.url",
        get_postgres_connection_string()
    )
    command.upgrade(alembic_config, "head")
    print("Migrations applied.")


if __name__ == "__main__":
    apply_migrations()

Check warning on line 17 in apply_migrations.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] apply_migrations.py#L17 <292>

no newline at end of file
Raw output
./apply_migrations.py:17:23: W292 no newline at end of file
155 changes: 68 additions & 87 deletions collector_db/AsyncDatabaseClient.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import Optional, Type

from fastapi import HTTPException
from sqlalchemy import select, exists, func, case, desc, Select, not_, and_
from sqlalchemy import select, exists, func, case, desc, Select, not_, and_, or_, update

Check warning on line 5 in collector_db/AsyncDatabaseClient.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] collector_db/AsyncDatabaseClient.py#L5 <401>

'sqlalchemy.or_' imported but unused
Raw output
./collector_db/AsyncDatabaseClient.py:5:1: F401 'sqlalchemy.or_' imported but unused
from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker
from sqlalchemy.orm import selectinload, joinedload, QueryableAttribute
from sqlalchemy.sql.functions import coalesce
Expand All @@ -22,7 +22,7 @@
from collector_db.models import URL, URLErrorInfo, URLHTMLContent, Base, \
RootURL, Task, TaskError, LinkTaskURL, Batch, Agency, AutomatedUrlAgencySuggestion, \
UserUrlAgencySuggestion, AutoRelevantSuggestion, AutoRecordTypeSuggestion, UserRelevantSuggestion, \
UserRecordTypeSuggestion, ApprovingUserURL
UserRecordTypeSuggestion, ApprovingUserURL, URLOptionalDataSourceMetadata
from collector_manager.enums import URLStatus, CollectorType
from core.DTOs.GetNextRecordTypeAnnotationResponseInfo import GetNextRecordTypeAnnotationResponseInfo
from core.DTOs.GetNextRelevanceAnnotationResponseInfo import GetNextRelevanceAnnotationResponseInfo
Expand All @@ -34,6 +34,7 @@
GetURLsResponseInnerInfo
from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo
from core.DTOs.task_data_objects.AgencyIdentificationTDO import AgencyIdentificationTDO
from core.DTOs.task_data_objects.URLMiscellaneousMetadataTDO import URLMiscellaneousMetadataTDO
from core.enums import BatchStatus, SuggestionType, RecordType
from html_tag_collector.DataClassTags import convert_to_response_html_info

Expand Down Expand Up @@ -353,6 +354,68 @@
scalar_result = await session.scalars(statement)
return bool(scalar_result.first())

@session_manager
async def has_pending_urls_missing_miscellaneous_metadata(self, session: AsyncSession) -> bool:
    """Report whether any URL matches the missing-miscellaneous-metadata query.

    Runs ``StatementComposer.pending_urls_missing_miscellaneous_metadata_query``
    capped at a single row and returns the truthiness of the first result.
    """
    # One row is enough to answer the question.
    probe = StatementComposer.pending_urls_missing_miscellaneous_metadata_query().limit(1)
    matches = await session.scalars(probe)
    return bool(matches.first())

@session_manager
async def get_pending_urls_missing_miscellaneous_metadata(
    self,
    session: AsyncSession
) -> list[URLMiscellaneousMetadataTDO]:
    """Fetch up to 100 URLs matching the missing-miscellaneous-metadata query.

    The rows come from
    ``StatementComposer.pending_urls_missing_miscellaneous_metadata_query``,
    ordered by ``URL.id``, and each one is converted into a
    ``URLMiscellaneousMetadataTDO`` carrying the URL id, its collector
    metadata and the collector type derived from the batch strategy.
    """
    statement = (
        StatementComposer.pending_urls_missing_miscellaneous_metadata_query()
        # Load the batch with the URL; its strategy is read below.
        .options(selectinload(URL.batch))
        .limit(100)
        .order_by(URL.id)
    )

    urls = (await session.scalars(statement)).all()
    return [
        URLMiscellaneousMetadataTDO(
            url_id=url.id,
            collector_metadata=url.collector_metadata,
            collector_type=CollectorType(url.batch.strategy),
        )
        for url in urls
    ]

@session_manager
async def add_miscellaneous_metadata(self, session: AsyncSession, tdos: list[URLMiscellaneousMetadataTDO]):
    """Store the miscellaneous metadata carried by each TDO.

    For every TDO, the matching ``URL`` row (by ``url_id``) gets its ``name``
    and ``description`` set, and a new ``URLOptionalDataSourceMetadata`` object
    with the record formats, data portal type and supplying entity is added
    to the session.

    NOTE(review): no commit happens here - presumably ``session_manager``
    handles it; confirm against the decorator.
    """
    # Build every UPDATE first, then run them in input order.
    url_updates = [
        update(URL)
        .where(URL.id == tdo.url_id)
        .values(name=tdo.name, description=tdo.description)
        for tdo in tdos
    ]
    for url_update in url_updates:
        await session.execute(url_update)

    # Stage one optional-metadata row per TDO.
    for tdo in tdos:
        session.add(
            URLOptionalDataSourceMetadata(
                url_id=tdo.url_id,
                record_formats=tdo.record_formats,
                data_portal_type=tdo.data_portal_type,
                supplying_entity=tdo.supplying_entity
            )
        )



@session_manager
async def get_pending_urls_without_html_data(self, session: AsyncSession):
# TODO: Add test that includes some urls WITH html data. Check they're not returned
Expand Down Expand Up @@ -433,97 +496,15 @@
)


#TODO: Slated for deletion
@session_manager
async def get_urls_with_html_data_and_without_metadata_type(
self,
session: AsyncSession,
without_metadata_type: URLMetadataAttributeType = URLMetadataAttributeType.RELEVANT
) -> list[URLWithHTML]:

# Get URLs with no relevancy metadata
statement = (select(URL)
.options(selectinload(URL.html_content))
.where(URL.outcome == URLStatus.PENDING.value))
# Exclude URLs with auto suggested record types
statement = self.statement_composer.exclude_urls_with_extant_model(
statement=statement,
model=AutoRecordTypeSuggestion
)
statement = statement.limit(100).order_by(URL.id)


# TODO: The below can probably be generalized


statement = self.statement_composer.exclude_urls_with_select_metadata(
statement=statement,
attribute=without_metadata_type
)
# TODO: Generalize
statement = statement.limit(100).order_by(URL.id)
raw_result = await session.execute(statement)
result = raw_result.all()
url_ids_to_urls = {url_id: url for url_id, url, _ in result}
url_ids_to_html_info = {url_id: [] for url_id, _, _ in result}

for url_id, _, html_info in result:
url_ids_to_html_info[url_id].append(
URLHTMLContentInfo(**html_info.__dict__)
)

final_results = []
for url_id, url in url_ids_to_urls.items():
url_with_html = URLWithHTML(
url_id=url_id,
url=url,
html_infos=url_ids_to_html_info[url_id]
)
final_results.append(url_with_html)

return final_results

@session_manager
async def has_pending_urls_with_html_data_and_without_metadata_type(
self,
session: AsyncSession,
without_metadata_type: URLMetadataAttributeType = URLMetadataAttributeType.RELEVANT
) -> bool:
# TODO: Generalize this so that it can exclude based on other attributes
# Get URLs with no relevancy metadata
statement = (select(URL.id, URL.url, URLHTMLContent).
join(URLHTMLContent).
where(URL.outcome == URLStatus.PENDING.value))
statement = self.statement_composer.exclude_urls_with_select_metadata(
statement=statement,
attribute=without_metadata_type
)
statement = statement.limit(1)
raw_result = await session.execute(statement)
result = raw_result.all()
return len(result) > 0



# @session_manager
# async def get_annotations_for_metadata_id(
# self,
# session: AsyncSession,
# metadata_id: int
# ) -> list[MetadataAnnotation]:
# statement = (select(MetadataAnnotation).
# where(MetadataAnnotation.metadata_id == metadata_id))
# scalar_result = await session.scalars(statement)
# all_results = scalar_result.all()
# return [MetadataAnnotationInfo(**result.__dict__) for result in all_results]

@session_manager
async def get_all(self, session, model: Base, order_by_attribute: Optional[str] = None) -> list[Base]:
    """
    Return every record of the given model.
    When order_by_attribute is given, results are ordered by that
    attribute of the model.
    Used primarily in testing
    """
    query = select(model)
    if order_by_attribute:
        # Resolve the attribute name to the model's attribute for ORDER BY.
        query = query.order_by(getattr(model, order_by_attribute))
    rows = await session.execute(query)
    return rows.scalars().all()

Expand Down
70 changes: 26 additions & 44 deletions collector_db/StatementComposer.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from typing import Any

from sqlalchemy import Select, select, exists, Table, func, Subquery
from sqlalchemy import Select, select, exists, Table, func, Subquery, and_

Check warning on line 3 in collector_db/StatementComposer.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] collector_db/StatementComposer.py#L3 <401>

'sqlalchemy.Table' imported but unused
Raw output
./collector_db/StatementComposer.py:3:1: F401 'sqlalchemy.Table' imported but unused
from sqlalchemy.orm import aliased

from collector_db.enums import URLMetadataAttributeType, ValidationStatus
from collector_db.models import URL, URLHTMLContent, AutomatedUrlAgencySuggestion
from collector_manager.enums import URLStatus
from collector_db.models import URL, URLHTMLContent, AutomatedUrlAgencySuggestion, URLOptionalDataSourceMetadata, Batch
from collector_manager.enums import URLStatus, CollectorType


class StatementComposer:
Expand Down Expand Up @@ -36,35 +36,7 @@
)
))

@staticmethod
def exclude_urls_with_select_metadata(
statement: Select,
attribute: URLMetadataAttributeType
) -> Select:
return (statement.where(
~exists(
select(URLMetadata.id).
where(
URLMetadata.url_id == URL.id,
URLMetadata.attribute == attribute.value
)
)
))

@staticmethod
def exclude_url_annotated_by_user(
statement: Select,
user_id: int
) -> Select:
return (statement.where(
~exists(
select(MetadataAnnotation.id).
where(
MetadataAnnotation.metadata_id == URLMetadata.id,
MetadataAnnotation.user_id == user_id
)
)
))


@staticmethod
Expand All @@ -88,19 +60,29 @@

return statement


@staticmethod
def pending_urls_missing_miscellaneous_metadata_query() -> Select:
    """Build the SELECT for pending URLs that lack miscellaneous metadata.

    A URL is selected when all of the following hold:

    * its ``outcome`` equals ``URLStatus.PENDING.value``;
    * its ``name`` and ``description`` are both NULL;
    * it has no ``URLOptionalDataSourceMetadata`` row (outer join, then
      keep only rows where the joined ``url_id`` is NULL);
    * its batch strategy is one of the collector types listed below.

    Returns:
        The un-executed ``Select``; callers add limits and ordering.
    """
    # Only batches produced by these collectors are considered.
    supported_strategies = [
        CollectorType.AUTO_GOOGLER.value,
        CollectorType.CKAN.value,
        CollectorType.MUCKROCK_ALL_SEARCH.value,
        CollectorType.MUCKROCK_COUNTY_SEARCH.value,
        CollectorType.MUCKROCK_SIMPLE_SEARCH.value
    ]
    query = (
        select(URL)
        .where(
            and_(
                URL.outcome == URLStatus.PENDING.value,
                # ``.is_(None)`` renders IS NULL, same as ``== None``,
                # without tripping flake8 E711.
                URL.name.is_(None),
                URL.description.is_(None),
                URLOptionalDataSourceMetadata.url_id.is_(None),
                Batch.strategy.in_(supported_strategies)
            )
        )
        .outerjoin(URLOptionalDataSourceMetadata)
        .join(Batch)
    )
    return query

Check warning on line 88 in collector_db/StatementComposer.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] collector_db/StatementComposer.py#L88 <292>

no newline at end of file
Raw output
./collector_db/StatementComposer.py:88:21: W292 no newline at end of file
1 change: 1 addition & 0 deletions collector_db/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ class TaskType(PyEnum):
RELEVANCY = "Relevancy"
RECORD_TYPE = "Record Type"
AGENCY_IDENTIFICATION = "Agency Identification"
MISC_METADATA = "Misc Metadata"

class PGEnum(TypeDecorator):
impl = postgresql.ENUM
Expand Down
Loading