Implement embedding extraction for LargeRecordingAnalyzer (#122)

joeweiss · web-flow · commit 777c7382f8b0 · 2024-12-05T16:28:55.000-05:00
* Implement embedding extraction for LargeRecordingAnalyzer
* Ensure the tests use the expected commit of BirdNET-Analyzer for command-line comparison
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -35,7 +35,8 @@ jobs:
           # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
       - name: Test with pytest
         run: |
-          pytest -m "not omit_during_ghactions"
+          git -C tests/BirdNET-Analyzer checkout 98945574c68102ccfac6c3504fcc63e64ed6f9e3
+          pytest -m "not omit_during_ghactions" --maxfail=1
   deploy:
     runs-on: ubuntu-latest
     needs: [test]
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -37,4 +37,5 @@ jobs:
           # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
       - name: Test with pytest
         run: |
-          pytest -m "not omit_during_ghactions"
+          git -C tests/BirdNET-Analyzer checkout 98945574c68102ccfac6c3504fcc63e64ed6f9e3
+          pytest -m "not omit_during_ghactions" --maxfail=1
diff --git a/src/birdnetlib/analyzer.py b/src/birdnetlib/analyzer.py
@@ -568,3 +568,27 @@ def analyze_recording(self, recording):
 
         self.results = results
         recording.detection_list = self.detections
+
+    def extract_embeddings_for_recording(self, recording):
+        """Store one embedding per full-length segment on ``self.embeddings``.
+
+        Segments come from ``read_audio_segments(recording.path, sr=48000)``;
+        those with fewer than ``recording.sample_secs * 48000`` samples are
+        skipped. Start/end times are taken directly from each segment.
+        """
+        print("extract_embeddings_for_recording", recording.filename)
+        sample_rate = 48000
+        min_samples = recording.sample_secs * sample_rate
+        results = []
+        for segment in read_audio_segments(recording.path, sr=sample_rate):
+            samples = segment["segment"]
+            if len(samples) < min_samples:
+                # Below the minimum segment duration; skip it.
+                continue
+
+            data = np.array([samples], dtype="float32")
+            emb = self._return_embeddings(data)[0].tolist()
+            start, end = segment["start_sec"], segment["end_sec"]
+            results.append({"start_time": start, "end_time": end, "embeddings": emb})
+
+        self.embeddings = results
diff --git a/src/birdnetlib/main.py b/src/birdnetlib/main.py
@@ -73,12 +73,6 @@ def analyze(self):
         self.analyzed = True
 
     def extract_embeddings(self):
-        # Check that analyzer is not LargeRecordingAnalyzer
-        if isinstance(self.analyzer, LargeRecordingAnalyzer):
-            raise IncompatibleAnalyzerError(
-                "LargeRecordingAnalyzer can only be used with the LargeRecording class"
-            )
-
         # Read and analyze.
         self.read_audio_data()
         self.analyzer.extract_embeddings_for_recording(self)
@@ -480,9 +474,9 @@ def analyze(self):
         self.analyzed = True
 
     def extract_embeddings(self):
-        raise NotImplementedError(
-            "Extraction of embeddings is not yet implemented for LargeRecordingAnalyzer. Use Analyzer if possible."
-        )
+        self.analyzer.extract_embeddings_for_recording(self)  # analyzer is expected to store its output on .embeddings
+        self.embeddings_list = self.analyzer.embeddings  # same list object, not a copy
+        self.embeddings_extracted = True  # set only after the extraction call has returned
 
     def get_extract_array(self, start_sec, end_sec):
         # Returns ndarray trimmed for start_sec:end_sec
diff --git a/tests/test_analyzer_versioning.py b/tests/test_analyzer_versioning.py
@@ -27,7 +27,9 @@ def run_before_and_after_tests():
     clean_up_temp_installed_versions()
     # Restore main branch for BirdNET-Analyzer to origin/main.
     birdnet_analyzer_path = os.path.join(os.path.dirname(__file__), "BirdNET-Analyzer")
-    os.system(f"cd {birdnet_analyzer_path}; git clean -fd; git switch main; git status")
+    os.system(
+        f"cd {birdnet_analyzer_path}; git clean -fd; git checkout 98945574c68102ccfac6c3504fcc63e64ed6f9e3; git status"
+    )
 
 
 def clean_up_temp_installed_versions():
diff --git a/tests/test_buffer_analyzer.py b/tests/test_buffer_analyzer.py
@@ -10,8 +10,10 @@
 from unittest.mock import patch
 import io
 
-def test_without_species_list():
+TEST_BN_COMMIT = "98945574c68102ccfac6c3504fcc63e64ed6f9e3"
+
 
+def test_without_species_list():
     # Process file with command line utility, then process with python library and ensure equal commandline_results.
 
     lon = -120.7463
@@ -24,6 +26,9 @@ def test_without_species_list():
 
     # Process using python script as is.
     birdnet_analyzer_path = os.path.join(os.path.dirname(__file__), "BirdNET-Analyzer")
+    os.system(
+        f"cd {birdnet_analyzer_path}; git clean -fd; git checkout {TEST_BN_COMMIT}; git status"
+    )
 
     cmd = f"python analyze.py --i '{input_path}' --o={output_path} --lat {lat} --lon {lon} --week {week_48} --min_conf {min_conf} --rtype=csv"
     print(cmd)
@@ -50,10 +55,10 @@ def test_without_species_list():
 
     # pprint(commandline_results)
     assert len(commandline_results) > 0
-    with open(input_path,'rb') as f:
+    with open(input_path, "rb") as f:
         wav_buffer = f.read()
     bytes_buffer = io.BytesIO(wav_buffer)
-    for rate,buffer in wavutils.bufferwavs(bytes_buffer):
+    for rate, buffer in wavutils.bufferwavs(bytes_buffer):
         analyzer = Analyzer()
         recording = RecordingBuffer(
             analyzer,
@@ -78,7 +83,6 @@ def test_without_species_list():
 
 
 def test_with_species_list_path():
-
     # Process file with command line utility, then process with python library and ensure equal commandline_results.
 
     lon = -120.7463
@@ -95,6 +99,9 @@ def test_with_species_list_path():
 
     # Process using python script as is.
     birdnet_analyzer_path = os.path.join(os.path.dirname(__file__), "BirdNET-Analyzer")
+    os.system(
+        f"cd {birdnet_analyzer_path}; git clean -fd; git checkout {TEST_BN_COMMIT}; git status"
+    )
 
     cmd = f"python analyze.py --i '{input_path}' --o={output_path} --min_conf {min_conf} --slist {custom_list_path} --rtype=csv"
     os.system(f"cd {birdnet_analyzer_path}; {cmd}")
@@ -120,10 +127,10 @@ def test_with_species_list_path():
 
     pprint(commandline_results)
     assert len(commandline_results) > 0
-    with open(input_path,'rb') as f:
+    with open(input_path, "rb") as f:
         wav_buffer = f.read()
     bytes_buffer = io.BytesIO(wav_buffer)
-    for rate,buffer in wavutils.bufferwavs(bytes_buffer):
+    for rate, buffer in wavutils.bufferwavs(bytes_buffer):
         analyzer = Analyzer(custom_species_list_path=custom_list_path)
         recording = RecordingBuffer(
             analyzer,
@@ -139,7 +146,8 @@ def test_with_species_list_path():
         pprint(recording.detections)
 
         assert (
-            commandline_results[0]["common_name"] == recording.detections[0]["common_name"]
+            commandline_results[0]["common_name"]
+            == recording.detections[0]["common_name"]
         )
 
         commandline_birds = [i["common_name"] for i in commandline_results]
@@ -151,11 +159,11 @@ def test_with_species_list_path():
             len(analyzer.custom_species_list) == 41
         )  # Check that this matches the number printed by the cli version.
 
-    with open(input_path,'rb') as f:
+    with open(input_path, "rb") as f:
         wav_buffer = f.read()
     bytes_buffer = io.BytesIO(wav_buffer)
     # Run a recording without path and throw an error when used with custom species list.
-    for rate,buffer in wavutils.bufferwavs(bytes_buffer):
+    for rate, buffer in wavutils.bufferwavs(bytes_buffer):
         with pytest.raises(ValueError):
             recording = RecordingBuffer(
                 analyzer,
@@ -170,7 +178,6 @@ def test_with_species_list_path():
 
 
 def test_with_species_list():
-
     # Process file with command line utility, then process with python library and ensure equal commandline_results.
 
     lon = -120.7463
@@ -187,6 +194,9 @@ def test_with_species_list():
 
     # Process using python script as is.
     birdnet_analyzer_path = os.path.join(os.path.dirname(__file__), "BirdNET-Analyzer")
+    os.system(
+        f"cd {birdnet_analyzer_path}; git clean -fd; git checkout {TEST_BN_COMMIT}; git status"
+    )
 
     cmd = f"python analyze.py --i '{input_path}' --o={output_path} --min_conf {min_conf} --slist {custom_list_path} --rtype=csv"
     os.system(f"cd {birdnet_analyzer_path}; {cmd}")
@@ -257,11 +267,11 @@ def test_with_species_list():
         "Zonotrichia albicollis_White-throated Sparrow",
     ]
 
-    with open(input_path,'rb') as f:
+    with open(input_path, "rb") as f:
         wav_buffer = f.read()
     bytes_buffer = io.BytesIO(wav_buffer)
     # Run a recording without path and throw an error when used with custom species list.
-    for rate,buffer in wavutils.bufferwavs(bytes_buffer):
+    for rate, buffer in wavutils.bufferwavs(bytes_buffer):
         analyzer = Analyzer(custom_species_list=custom_species_list)
         recording = RecordingBuffer(
             analyzer,
@@ -277,7 +287,8 @@ def test_with_species_list():
         pprint(recording.detections)
 
         assert (
-            commandline_results[0]["common_name"] == recording.detections[0]["common_name"]
+            commandline_results[0]["common_name"]
+            == recording.detections[0]["common_name"]
         )
 
         commandline_birds = [i["common_name"] for i in commandline_results]
@@ -290,11 +301,11 @@ def test_with_species_list():
         )  # Check that this matches the number printed by the cli version.
 
     # Run a recording with lat/lon and throw an error when used with custom species list.
-    with open(input_path,'rb') as f:
+    with open(input_path, "rb") as f:
         wav_buffer = f.read()
     bytes_buffer = io.BytesIO(wav_buffer)
     # Run a recording without path and throw an error when used with custom species list.
-    for rate,buffer in wavutils.bufferwavs(bytes_buffer):
+    for rate, buffer in wavutils.bufferwavs(bytes_buffer):
         with pytest.raises(ValueError):
             recording = RecordingBuffer(
                 analyzer,
@@ -309,7 +320,6 @@ def test_with_species_list():
 
 
 def test_species_list_calls():
-
     lon = -120.7463
     lat = 35.4244
     week_48 = 18
@@ -324,11 +334,11 @@ def test_species_list_calls():
         "return_predicted_species_list",
         wraps=analyzer.return_predicted_species_list,
     ) as wrapped_return_predicted_species_list:
-        with open(input_path,'rb') as f:
+        with open(input_path, "rb") as f:
             wav_buffer = f.read()
         bytes_buffer = io.BytesIO(wav_buffer)
         # Run a recording without path and throw an error when used with custom species list.
-        for rate,buffer in wavutils.bufferwavs(bytes_buffer):
+        for rate, buffer in wavutils.bufferwavs(bytes_buffer):
             recording = RecordingBuffer(
                 analyzer,
                 buffer,
@@ -342,10 +352,10 @@ def test_species_list_calls():
             assert wrapped_return_predicted_species_list.call_count == 1
 
         # Second recording with the same position/time should not regerate the species list.
-        with open(input_path,'rb') as f:
+        with open(input_path, "rb") as f:
             wav_buffer = f.read()
         bytes_buffer = io.BytesIO(wav_buffer)
-        for rate,buffer in wavutils.bufferwavs(bytes_buffer):
+        for rate, buffer in wavutils.bufferwavs(bytes_buffer):
             recording = RecordingBuffer(
                 analyzer,
                 buffer,
diff --git a/tests/test_embeddings.py b/tests/test_embeddings.py
@@ -10,6 +10,7 @@
 import numpy as np
 
 
+@pytest.mark.omit_during_ghactions
 def test_embeddings():
     # Process file with command line utility, then process with python library and ensure equal commandline_results.
 
@@ -68,6 +69,7 @@ def test_embeddings():
         )
 
 
+@pytest.mark.omit_during_ghactions
 def test_largefile_embeddings():
     # Process file with command line utility, then process with python library and ensure equal commandline_results.
 
@@ -101,8 +103,6 @@ def test_largefile_embeddings():
     # pprint(commandline_results)
     assert len(commandline_results) == 40
 
-    # TODO: Implement for LargeRecording.
-    # Confirm that LargeRecording return not implemented.
     large_analyzer = LargeRecordingAnalyzer()
     recording = LargeRecording(
         large_analyzer,
@@ -113,6 +113,16 @@ def test_largefile_embeddings():
         min_conf=min_conf,
         return_all_detections=True,
     )
-    msg = "Extraction of embeddings is not yet implemented for LargeRecordingAnalyzer. Use Analyzer if possible."
-    with pytest.raises(NotImplementedError, match=msg):
-        recording.extract_embeddings()
+
+    recording.extract_embeddings()
+
+    # Check that birdnetlib results match command line results.
+    assert len(recording.embeddings) == 40
+    for idx, i in enumerate(commandline_results):
+        # Specify the tolerance level
+        tolerance = 1e-4  # 4 decimal points tolerance between BirdNET and birdnetlib.
+
+        # Assert that the arrays are almost equal within the tolerance
+        assert np.allclose(
+            i["embeddings"], recording.embeddings[idx]["embeddings"], atol=tolerance
+        )