diff --git a/.github/workflows/test_app.yml b/.github/workflows/test_app.yml index c83608ac..e16d1771 100644 --- a/.github/workflows/test_app.yml +++ b/.github/workflows/test_app.yml @@ -21,7 +21,7 @@ jobs: container-job: runs-on: ubuntu-latest timeout-minutes: 20 - container: python:3.12.8 + container: python:3.11.9 services: postgres: diff --git a/api/routes/annotate.py b/api/routes/annotate.py index 25eab1d3..27b21708 100644 --- a/api/routes/annotate.py +++ b/api/routes/annotate.py @@ -1,9 +1,11 @@ from fastapi import APIRouter, Depends, Path from api.dependencies import get_async_core +from collector_db.enums import URLMetadataAttributeType from core.AsyncCore import AsyncCore -from core.DTOs.GetNextURLForRelevanceAnnotationResponse import GetNextURLForRelevanceAnnotationResponse -from core.DTOs.RelevanceAnnotationInfo import RelevanceAnnotationPostInfo +from core.DTOs.GetNextURLForAnnotationResponse import GetNextURLForAnnotationResponse +from core.DTOs.RecordTypeAnnotationPostInfo import RecordTypeAnnotationPostInfo +from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo from security_manager.SecurityManager import get_access_info, AccessInfo annotate_router = APIRouter( @@ -17,8 +19,11 @@ async def get_next_url_for_relevance_annotation( access_info: AccessInfo = Depends(get_access_info), async_core: AsyncCore = Depends(get_async_core), -) -> GetNextURLForRelevanceAnnotationResponse: - result = await async_core.get_next_url_for_relevance_annotation(user_id=access_info.user_id) +) -> GetNextURLForAnnotationResponse: + result = await async_core.get_next_url_for_annotation( + user_id=access_info.user_id, + metadata_type=URLMetadataAttributeType.RELEVANT + ) return result @@ -28,14 +33,43 @@ async def annotate_url_for_relevance_and_get_next_url( metadata_id: int = Path(description="The metadata id for the associated URL metadata"), async_core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info) -) -> GetNextURLForRelevanceAnnotationResponse: +) -> GetNextURLForAnnotationResponse: + """ + Post URL annotation and get next URL to annotate + """ + result = await async_core.submit_and_get_next_url_for_annotation( + user_id=access_info.user_id, + metadata_id=metadata_id, + annotation=str(relevance_annotation_post_info.is_relevant), + metadata_type = URLMetadataAttributeType.RELEVANT + ) + return result + +@annotate_router.get("/record-type") +async def get_next_url_for_record_type_annotation( + access_info: AccessInfo = Depends(get_access_info), + async_core: AsyncCore = Depends(get_async_core), +) -> GetNextURLForAnnotationResponse: + result = await async_core.get_next_url_for_annotation( + user_id=access_info.user_id, + metadata_type=URLMetadataAttributeType.RECORD_TYPE + ) + return result + +@annotate_router.post("/record-type/{metadata_id}") +async def annotate_url_for_record_type_and_get_next_url( + record_type_annotation_post_info: RecordTypeAnnotationPostInfo, + metadata_id: int = Path(description="The metadata id for the associated URL metadata"), + async_core: AsyncCore = Depends(get_async_core), + access_info: AccessInfo = Depends(get_access_info) +) -> GetNextURLForAnnotationResponse: """ Post URL annotation and get next URL to annotate """ - await async_core.submit_url_relevance_annotation( + result = await async_core.submit_and_get_next_url_for_annotation( user_id=access_info.user_id, metadata_id=metadata_id, - annotation=relevance_annotation_post_info + annotation=record_type_annotation_post_info.record_type.value, + metadata_type=URLMetadataAttributeType.RECORD_TYPE ) - result = await async_core.get_next_url_for_relevance_annotation(user_id=access_info.user_id) return result diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 07f1cc10..04d40a82 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -23,7 +23,7 @@ from core.DTOs.GetTasksResponse import GetTasksResponse, GetTasksResponseTaskInfo from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo, GetURLsResponseMetadataInfo, GetURLsResponseErrorInfo, \ GetURLsResponseInnerInfo -from core.DTOs.RelevanceAnnotationInfo import RelevanceAnnotationPostInfo +from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo from core.enums import BatchStatus @@ -232,10 +232,11 @@ async def update_url_metadata_status(self, session: AsyncSession, metadata_ids: url_metadata.validation_status = validation_status @session_manager - async def get_next_url_for_relevance_annotation( + async def get_next_url_for_annotation( self, session: AsyncSession, - user_id: int + user_id: int, + metadata_type: URLMetadataAttributeType ) -> URLAnnotationInfo: # Get a URL, its relevancy metadata ID, and HTML data # For a URL which has not yet been annotated by this user id @@ -246,10 +247,11 @@ async def get_next_url_for_relevance_annotation( URL.id.label("url_id"), URL.url, URLMetadata.id.label("metadata_id"), + URLMetadata.value, ) .join(URLMetadata) # Metadata must be relevant - .where(URLMetadata.attribute == URLMetadataAttributeType.RELEVANT.value) + .where(URLMetadata.attribute == metadata_type.value) # Metadata must not be validated .where(URLMetadata.validation_status == ValidationStatus.PENDING_VALIDATION.value) # URL must have HTML content entries @@ -274,6 +276,7 @@ async def get_next_url_for_relevance_annotation( select( subquery.c.url, subquery.c.metadata_id, + subquery.c.value, URLHTMLContent.content_type, URLHTMLContent.content, ) @@ -291,9 +294,10 @@ async def get_next_url_for_relevance_annotation( annotation_info = URLAnnotationInfo( url=result[0][0], metadata_id=result[0][1], + suggested_value=result[0][2], html_infos=[] ) - for _, _, content_type, content in result: + for _, _, _, content_type, content in result: html_info = URLHTMLContentInfo( content_type=content_type, content=content @@ -307,11 +311,12 @@ async def add_relevance_annotation( session: AsyncSession, user_id: int, metadata_id: int, - annotation_info: RelevanceAnnotationPostInfo): + annotation: str + ): annotation = MetadataAnnotation( metadata_id=metadata_id, user_id=user_id, - value=str(annotation_info.is_relevant) + value=annotation ) session.add(annotation) diff --git a/collector_db/DTOs/URLAnnotationInfo.py b/collector_db/DTOs/URLAnnotationInfo.py index 54792dfc..844b226d 100644 --- a/collector_db/DTOs/URLAnnotationInfo.py +++ b/collector_db/DTOs/URLAnnotationInfo.py @@ -6,4 +6,5 @@ class URLAnnotationInfo(BaseModel): metadata_id: int url: str - html_infos: list[URLHTMLContentInfo] \ No newline at end of file + html_infos: list[URLHTMLContentInfo] + suggested_value: str \ No newline at end of file diff --git a/collector_db/StatementComposer.py b/collector_db/StatementComposer.py index dc756fb3..c042e10c 100644 --- a/collector_db/StatementComposer.py +++ b/collector_db/StatementComposer.py @@ -1,8 +1,8 @@ from sqlalchemy import Select, select, exists, Table, func, Subquery -from collector_db.enums import URLMetadataAttributeType -from collector_db.models import URL, URLHTMLContent, URLMetadata +from collector_db.enums import URLMetadataAttributeType, ValidationStatus +from collector_db.models import URL, URLHTMLContent, URLMetadata, MetadataAnnotation from collector_manager.enums import URLStatus @@ -33,6 +33,22 @@ def exclude_urls_with_select_metadata( ) )) + @staticmethod + def exclude_url_annotated_by_user( + statement: Select, + user_id: int + ) -> Select: + return (statement.where( + ~exists( + select(MetadataAnnotation.id). + where( + MetadataAnnotation.metadata_id == URLMetadata.id, + MetadataAnnotation.user_id == user_id + ) + ) + )) + + @staticmethod def simple_count_subquery(model, attribute: str, label: str) -> Subquery: attr_value = getattr(model, attribute) diff --git a/core/AsyncCore.py b/core/AsyncCore.py index afa5c7ab..6ab9fcf5 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -3,12 +3,11 @@ from collector_db.AsyncDatabaseClient import AsyncDatabaseClient from collector_db.DTOs.TaskInfo import TaskInfo from collector_db.DTOs.URLAnnotationInfo import URLAnnotationInfo -from collector_db.enums import TaskType -from core.DTOs.GetNextURLForRelevanceAnnotationResponse import GetNextURLForRelevanceAnnotationResponse +from collector_db.enums import TaskType, URLMetadataAttributeType +from core.DTOs.GetNextURLForAnnotationResponse import GetNextURLForAnnotationResponse from core.DTOs.GetTasksResponse import GetTasksResponse from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo -from core.DTOs.RelevanceAnnotationInfo import RelevanceAnnotationPostInfo -from core.DTOs.RelevanceAnnotationRequestInfo import RelevanceAnnotationRequestInfo +from core.DTOs.AnnotationRequestInfo import AnnotationRequestInfo from core.classes.URLHTMLTaskOperator import URLHTMLTaskOperator from core.classes.URLRecordTypeTaskOperator import URLRecordTypeTaskOperator from core.classes.URLRelevanceHuggingfaceTaskOperator import URLRelevanceHuggingfaceTaskOperator @@ -66,39 +65,62 @@ async def run_tasks(self): await self.run_url_relevance_huggingface_task() await self.run_url_record_type_task() - async def convert_to_relevance_annotation_request_info(self, url_info: URLAnnotationInfo) -> RelevanceAnnotationRequestInfo: + async def convert_to_annotation_request_info(self, url_info: URLAnnotationInfo) -> AnnotationRequestInfo: response_html_info = convert_to_response_html_info( html_content_infos=url_info.html_infos ) - return RelevanceAnnotationRequestInfo( + return AnnotationRequestInfo( url=url_info.url, metadata_id=url_info.metadata_id, - html_info=response_html_info + html_info=response_html_info, + suggested_value=url_info.suggested_value ) - async def get_next_url_for_relevance_annotation(self, user_id: int) -> GetNextURLForRelevanceAnnotationResponse: - response = GetNextURLForRelevanceAnnotationResponse() - ua_info: URLAnnotationInfo = await self.adb_client.get_next_url_for_relevance_annotation(user_id=user_id) + async def get_next_url_for_annotation(self, user_id: int, metadata_type: URLMetadataAttributeType) -> GetNextURLForAnnotationResponse: + response = GetNextURLForAnnotationResponse() + ua_info: URLAnnotationInfo = await self.adb_client.get_next_url_for_annotation( + user_id=user_id, + metadata_type=metadata_type + ) if ua_info is None: return response # Format result - result = await self.convert_to_relevance_annotation_request_info(url_info=ua_info) + result = await self.convert_to_annotation_request_info(url_info=ua_info) response.next_annotation = result return response + async def submit_and_get_next_url_for_annotation( + self, + user_id: int, + metadata_id: int, + annotation: str, + metadata_type: URLMetadataAttributeType + ) -> GetNextURLForAnnotationResponse: + await self.submit_url_annotation( + user_id=user_id, + metadata_id=metadata_id, + annotation=annotation, + metadata_type=metadata_type + ) + result = await self.get_next_url_for_annotation( + user_id=user_id, + metadata_type=metadata_type + ) + return result - async def submit_url_relevance_annotation( + async def submit_url_annotation( self, user_id: int, metadata_id: int, - annotation: RelevanceAnnotationPostInfo - ) -> GetNextURLForRelevanceAnnotationResponse: + annotation: str, + metadata_type: URLMetadataAttributeType + ) -> GetNextURLForAnnotationResponse: await self.adb_client.add_relevance_annotation( user_id=user_id, metadata_id=metadata_id, - annotation_info=annotation) - return await self.get_next_url_for_relevance_annotation(user_id=user_id) + annotation=annotation) + return await self.get_next_url_for_annotation(user_id=user_id, metadata_type=metadata_type) async def get_urls(self, page: int, errors: bool) -> GetURLsResponseInfo: return await self.adb_client.get_urls(page=page, errors=errors) diff --git a/core/DTOs/RelevanceAnnotationRequestInfo.py b/core/DTOs/AnnotationRequestInfo.py similarity index 57% rename from core/DTOs/RelevanceAnnotationRequestInfo.py rename to core/DTOs/AnnotationRequestInfo.py index de4036db..1e886ae8 100644 --- a/core/DTOs/RelevanceAnnotationRequestInfo.py +++ b/core/DTOs/AnnotationRequestInfo.py @@ -3,7 +3,8 @@ from html_tag_collector.DataClassTags import ResponseHTMLInfo -class RelevanceAnnotationRequestInfo(BaseModel): +class AnnotationRequestInfo(BaseModel): url: str metadata_id: int - html_info: ResponseHTMLInfo \ No newline at end of file + html_info: ResponseHTMLInfo + suggested_value: str \ No newline at end of file diff --git a/core/DTOs/GetNextURLForAnnotationResponse.py b/core/DTOs/GetNextURLForAnnotationResponse.py new file mode 100644 index 00000000..b4bc1087 --- /dev/null +++ b/core/DTOs/GetNextURLForAnnotationResponse.py @@ -0,0 +1,9 @@ +from typing import Optional + +from pydantic import BaseModel + +from core.DTOs.AnnotationRequestInfo import AnnotationRequestInfo + + +class GetNextURLForAnnotationResponse(BaseModel): + next_annotation: Optional[AnnotationRequestInfo] = None diff --git a/core/DTOs/GetNextURLForRelevanceAnnotationResponse.py b/core/DTOs/GetNextURLForRelevanceAnnotationResponse.py deleted file mode 100644 index a58a4565..00000000 --- a/core/DTOs/GetNextURLForRelevanceAnnotationResponse.py +++ /dev/null @@ -1,9 +0,0 @@ -from typing import Optional - -from pydantic import BaseModel - -from core.DTOs.RelevanceAnnotationRequestInfo import RelevanceAnnotationRequestInfo - - -class GetNextURLForRelevanceAnnotationResponse(BaseModel): - next_annotation: Optional[RelevanceAnnotationRequestInfo] = None diff --git a/core/DTOs/RecordTypeAnnotationPostInfo.py b/core/DTOs/RecordTypeAnnotationPostInfo.py new file mode 100644 index 00000000..87e8b674 --- /dev/null +++ b/core/DTOs/RecordTypeAnnotationPostInfo.py @@ -0,0 +1,7 @@ +from pydantic import BaseModel + +from core.enums import RecordType + + +class RecordTypeAnnotationPostInfo(BaseModel): + record_type: RecordType \ No newline at end of file diff --git a/core/DTOs/RelevanceAnnotationInfo.py b/core/DTOs/RelevanceAnnotationPostInfo.py similarity index 100% rename from core/DTOs/RelevanceAnnotationInfo.py rename to core/DTOs/RelevanceAnnotationPostInfo.py diff --git a/tests/test_automated/integration/api/helpers/RequestValidator.py b/tests/test_automated/integration/api/helpers/RequestValidator.py index 220b6645..d3e60e1d 100644 --- a/tests/test_automated/integration/api/helpers/RequestValidator.py +++ b/tests/test_automated/integration/api/helpers/RequestValidator.py @@ -12,13 +12,14 @@ from core.DTOs.GetBatchLogsResponse import GetBatchLogsResponse from core.DTOs.GetBatchStatusResponse import GetBatchStatusResponse from core.DTOs.GetDuplicatesByBatchResponse import GetDuplicatesByBatchResponse -from core.DTOs.GetNextURLForRelevanceAnnotationResponse import GetNextURLForRelevanceAnnotationResponse +from core.DTOs.GetNextURLForAnnotationResponse import GetNextURLForAnnotationResponse from core.DTOs.GetTasksResponse import GetTasksResponse from core.DTOs.GetURLsByBatchResponse import GetURLsByBatchResponse from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo from core.DTOs.MessageCountResponse import MessageCountResponse from core.DTOs.MessageResponse import MessageResponse -from core.DTOs.RelevanceAnnotationInfo import RelevanceAnnotationPostInfo +from core.DTOs.RecordTypeAnnotationPostInfo import RecordTypeAnnotationPostInfo +from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo from core.enums import BatchStatus from util.helper_functions import update_if_not_none @@ -175,22 +176,39 @@ def process_relevancy(self) -> MessageCountResponse: ) return MessageCountResponse(**data) - def get_next_relevance_annotation(self) -> GetNextURLForRelevanceAnnotationResponse: + def get_next_relevance_annotation(self) -> GetNextURLForAnnotationResponse: data = self.get( url=f"/annotate/relevance" ) - return GetNextURLForRelevanceAnnotationResponse(**data) + return GetNextURLForAnnotationResponse(**data) + + def get_next_record_type_annotation(self) -> GetNextURLForAnnotationResponse: + data = self.get( + url=f"/annotate/record-type" + ) + return GetNextURLForAnnotationResponse(**data) + + def post_record_type_annotation_and_get_next( + self, + metadata_id: int, + record_type_annotation_post_info: RecordTypeAnnotationPostInfo + ) -> GetNextURLForAnnotationResponse: + data = self.post( + url=f"/annotate/record-type/{metadata_id}", + json=record_type_annotation_post_info.model_dump(mode='json') + ) + return GetNextURLForAnnotationResponse(**data) def post_relevance_annotation_and_get_next( self, metadata_id: int, relevance_annotation_post_info: RelevanceAnnotationPostInfo - ) -> GetNextURLForRelevanceAnnotationResponse: + ) -> GetNextURLForAnnotationResponse: data = self.post( url=f"/annotate/relevance/{metadata_id}", - json=relevance_annotation_post_info.model_dump() + json=relevance_annotation_post_info.model_dump(mode='json') ) - return GetNextURLForRelevanceAnnotationResponse(**data) + return GetNextURLForAnnotationResponse(**data) def get_urls(self, page: int = 1, errors: bool = False) -> GetURLsResponseInfo: data = self.get( diff --git a/tests/test_automated/integration/api/test_annotate.py b/tests/test_automated/integration/api/test_annotate.py index 5b8730cf..1ee03963 100644 --- a/tests/test_automated/integration/api/test_annotate.py +++ b/tests/test_automated/integration/api/test_annotate.py @@ -1,14 +1,23 @@ +from typing import Any + import pytest from collector_db.DTOs.InsertURLsInfo import InsertURLsInfo from collector_db.enums import URLMetadataAttributeType, ValidationStatus, ValidationSource -from core.DTOs.GetNextURLForRelevanceAnnotationResponse import GetNextURLForRelevanceAnnotationResponse -from core.DTOs.RelevanceAnnotationInfo import RelevanceAnnotationPostInfo +from core.DTOs.GetNextURLForAnnotationResponse import GetNextURLForAnnotationResponse +from core.DTOs.RecordTypeAnnotationPostInfo import RecordTypeAnnotationPostInfo +from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo +from core.enums import RecordType from tests.test_automated.integration.api.conftest import MOCK_USER_ID - -@pytest.mark.asyncio -async def test_annotate(api_test_helper): +async def run_annotation_test( + api_test_helper, + submit_and_get_next_function: callable, + get_next_function: callable, + post_info: Any, + metadata_attribute: URLMetadataAttributeType, + expected_metadata_value: str +): ath = api_test_helper # Create batch with status `in-process` and strategy `example` @@ -20,7 +29,7 @@ async def test_annotate(api_test_helper): url_2 = iui.url_mappings[1] kwargs = { - "attribute": URLMetadataAttributeType.RELEVANT, + "attribute": metadata_attribute, "validation_status": ValidationStatus.PENDING_VALIDATION, "validation_source": ValidationSource.MACHINE_LEARNING } @@ -39,20 +48,18 @@ async def test_annotate(api_test_helper): # Add HTML data to both await ath.db_data_creator.html_data([url_1.url_id, url_2.url_id]) # Call `GET` `/annotate/url` and receive next URL - request_info_1: GetNextURLForRelevanceAnnotationResponse = ath.request_validator.get_next_relevance_annotation() + request_info_1: GetNextURLForAnnotationResponse = get_next_function() inner_info_1 = request_info_1.next_annotation # Validate presence of HTML data in `html` field assert inner_info_1.html_info.description != "" assert inner_info_1.html_info.title != "" + assert inner_info_1.suggested_value == "False" - post_info = RelevanceAnnotationPostInfo( - is_relevant=True - ) # Call `POST` `/annotate/url` with finished annotation, and receive next URL - request_info_2 = ath.request_validator.post_relevance_annotation_and_get_next( - metadata_id=inner_info_1.metadata_id, - relevance_annotation_post_info=post_info + request_info_2 = submit_and_get_next_function( + inner_info_1.metadata_id, + post_info ) inner_info_2 = request_info_2.next_annotation # Confirm 2nd URL is distinct from 1st @@ -68,12 +75,38 @@ async def test_annotate(api_test_helper): ) assert len(results) == 1 assert results[0].user_id == MOCK_USER_ID - assert results[0].value == "True" + assert results[0].value == expected_metadata_value # Submit this one in turn, and no subsequent annotation info should be returned - request_info_3 = ath.request_validator.post_relevance_annotation_and_get_next( - metadata_id=inner_info_2.metadata_id, - relevance_annotation_post_info=post_info + request_info_3 = submit_and_get_next_function( + inner_info_2.metadata_id, + post_info + ) + + assert request_info_3.next_annotation is None + +@pytest.mark.asyncio +async def test_annotate_relevancy(api_test_helper): + await run_annotation_test( + api_test_helper=api_test_helper, + submit_and_get_next_function=api_test_helper.request_validator.post_relevance_annotation_and_get_next, + get_next_function=api_test_helper.request_validator.get_next_relevance_annotation, + post_info=RelevanceAnnotationPostInfo( + is_relevant=True + ), + metadata_attribute=URLMetadataAttributeType.RELEVANT, + expected_metadata_value="True" ) - assert request_info_3.next_annotation is None \ No newline at end of file +@pytest.mark.asyncio +async def test_annotate_record_type(api_test_helper): + await run_annotation_test( + api_test_helper=api_test_helper, + submit_and_get_next_function=api_test_helper.request_validator.post_record_type_annotation_and_get_next, + get_next_function=api_test_helper.request_validator.get_next_record_type_annotation, + post_info=RecordTypeAnnotationPostInfo( + record_type=RecordType.ACCIDENT_REPORTS + ), + metadata_attribute=URLMetadataAttributeType.RECORD_TYPE, + expected_metadata_value=RecordType.ACCIDENT_REPORTS.value + )