From 42a0ff2b628ecfc2e8e0f13cfaeed0bc86b516bf Mon Sep 17 00:00:00 2001 From: Davide Polato Date: Thu, 2 Apr 2026 09:41:43 +0200 Subject: [PATCH 1/4] Add scaffolding for stormcrawler-opensearch-java module - Cloned external/opensearch to external/opensearch-java to introduce the new client as a drop-in replacement. - Updated Maven artifactId and names in the new local POMs (including the archetype). - Registered the new module in the root POM. This commit isolates the pure file duplication. The actual migration to the opensearch-java client will be done in the next commit to ensure a clean, readable Git diff for reviewers. --- external/opensearch-java/README.md | 70 +++ external/opensearch-java/archetype/pom.xml | 72 +++ .../META-INF/archetype-post-generate.groovy | 21 + .../META-INF/maven/archetype-metadata.xml | 72 +++ .../archetype-resources/OS_IndexInit.sh | 40 ++ .../resources/archetype-resources/README.md | 80 +++ .../archetype-resources/crawler-conf.yaml | 160 ++++++ .../archetype-resources/crawler.flux | 141 +++++ .../dashboards/importDashboards.sh | 29 ++ .../dashboards/metrics.ndjson | 10 + .../dashboards/status.ndjson | 5 + .../dashboards/storm.ndjson | 5 + .../archetype-resources/docker-compose.yml | 81 +++ .../archetype-resources/injection.flux | 50 ++ .../archetype-resources/opensearch-conf.yaml | 115 ++++ .../resources/archetype-resources/pom.xml | 149 ++++++ .../main/resources/default-regex-filters.txt | 32 ++ .../resources/default-regex-normalizers.xml | 78 +++ .../src/main/resources/indexer.mapping | 40 ++ .../src/main/resources/jsoupfilters.json | 27 + .../src/main/resources/metrics.mapping | 40 ++ .../src/main/resources/parsefilters.json | 23 + .../src/main/resources/status.mapping | 39 ++ .../src/main/resources/urlfilters.json | 60 +++ .../dashboards/importDashboards.sh | 29 ++ .../opensearch-java/dashboards/metrics.ndjson | 10 + .../opensearch-java/dashboards/status.ndjson | 5 + .../opensearch-java/dashboards/storm.ndjson | 5 + 
external/opensearch-java/opensearch-conf.yaml | 128 +++++ external/opensearch-java/pom.xml | 121 +++++ .../BulkItemResponseToFailedFlag.java | 134 +++++ .../stormcrawler/opensearch/Constants.java | 23 + .../opensearch/IndexCreation.java | 116 +++++ .../opensearch/OpenSearchConnection.java | 349 +++++++++++++ .../opensearch/bolt/DeletionBolt.java | 318 ++++++++++++ .../opensearch/bolt/IndexerBolt.java | 473 +++++++++++++++++ .../filtering/JSONURLFilterWrapper.java | 175 +++++++ .../opensearch/metrics/MetricsConsumer.java | 164 ++++++ .../opensearch/metrics/StatusMetricsBolt.java | 169 ++++++ .../parse/filter/JSONResourceWrapper.java | 171 ++++++ .../opensearch/persistence/AbstractSpout.java | 236 +++++++++ .../persistence/AggregationSpout.java | 373 +++++++++++++ .../opensearch/persistence/HybridSpout.java | 227 ++++++++ .../persistence/StatusUpdaterBolt.java | 490 ++++++++++++++++++ .../bolt/AbstractOpenSearchTest.java | 52 ++ .../opensearch/bolt/IndexerBoltTest.java | 138 +++++ .../opensearch/bolt/StatusBoltTest.java | 149 ++++++ .../src/test/resources/indexer.mapping | 40 ++ .../src/test/resources/metrics.mapping | 40 ++ .../src/test/resources/status.mapping | 39 ++ pom.xml | 2 + 51 files changed, 5615 insertions(+) create mode 100644 external/opensearch-java/README.md create mode 100644 external/opensearch-java/archetype/pom.xml create mode 100644 external/opensearch-java/archetype/src/main/resources/META-INF/archetype-post-generate.groovy create mode 100644 external/opensearch-java/archetype/src/main/resources/META-INF/maven/archetype-metadata.xml create mode 100755 external/opensearch-java/archetype/src/main/resources/archetype-resources/OS_IndexInit.sh create mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/README.md create mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/crawler-conf.yaml create mode 100644 
external/opensearch-java/archetype/src/main/resources/archetype-resources/crawler.flux create mode 100755 external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/importDashboards.sh create mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/metrics.ndjson create mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/status.ndjson create mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/storm.ndjson create mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/docker-compose.yml create mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/injection.flux create mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/opensearch-conf.yaml create mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/pom.xml create mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-filters.txt create mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-normalizers.xml create mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/indexer.mapping create mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/jsoupfilters.json create mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/metrics.mapping create mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/parsefilters.json create mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/status.mapping create mode 100644 
external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/urlfilters.json create mode 100755 external/opensearch-java/dashboards/importDashboards.sh create mode 100644 external/opensearch-java/dashboards/metrics.ndjson create mode 100644 external/opensearch-java/dashboards/status.ndjson create mode 100644 external/opensearch-java/dashboards/storm.ndjson create mode 100644 external/opensearch-java/opensearch-conf.yaml create mode 100644 external/opensearch-java/pom.xml create mode 100644 external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/BulkItemResponseToFailedFlag.java create mode 100644 external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/Constants.java create mode 100644 external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/IndexCreation.java create mode 100644 external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/OpenSearchConnection.java create mode 100644 external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/bolt/DeletionBolt.java create mode 100644 external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/bolt/IndexerBolt.java create mode 100644 external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/filtering/JSONURLFilterWrapper.java create mode 100644 external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/metrics/MetricsConsumer.java create mode 100644 external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/metrics/StatusMetricsBolt.java create mode 100644 external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java create mode 100644 external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/AbstractSpout.java create mode 100644 external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/AggregationSpout.java create mode 100644 
external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/HybridSpout.java create mode 100644 external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java create mode 100644 external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/bolt/AbstractOpenSearchTest.java create mode 100644 external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/bolt/IndexerBoltTest.java create mode 100644 external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/bolt/StatusBoltTest.java create mode 100644 external/opensearch-java/src/test/resources/indexer.mapping create mode 100644 external/opensearch-java/src/test/resources/metrics.mapping create mode 100644 external/opensearch-java/src/test/resources/status.mapping diff --git a/external/opensearch-java/README.md b/external/opensearch-java/README.md new file mode 100644 index 000000000..159bb29b6 --- /dev/null +++ b/external/opensearch-java/README.md @@ -0,0 +1,70 @@ +stormcrawler-opensearch +=========================== + +A collection of resources for [OpenSearch](https://opensearch.org/): +* [IndexerBolt](https://github.com/apache/stormcrawler/blob/master/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/bolt/IndexerBolt.java) for indexing documents crawled with StormCrawler +* [Spouts](https://github.com/apache/stormcrawler/blob/master/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/persistence/AggregationSpout.java) and [StatusUpdaterBolt](https://github.com/apache/stormcrawler/blob/master/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java) for persisting URL information in recursive crawls +* [MetricsConsumer](https://github.com/apache/stormcrawler/blob/master/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/metrics/MetricsConsumer.java) +* 
[StatusMetricsBolt](https://github.com/apache/stormcrawler/blob/master/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/metrics/StatusMetricsBolt.java) for sending the breakdown of URLs per status as metrics and display its evolution over time. + +as well as resources for building basic real-time monitoring dashboards for the crawls, see below. + +This module is ported from the Elasticsearch one. + +Getting started +--------------------- + +The easiest way is currently to use the archetype for OpenSearch with: + +`mvn archetype:generate -DarchetypeGroupId=org.apache.stormcrawler -DarchetypeArtifactId=stormcrawler-opensearch-archetype -DarchetypeVersion=3.4.0` + +You'll be asked to enter a groupId (e.g. com.mycompany.crawler), an artefactId (e.g. stormcrawler), a version, a package name and details about the user agent to use. + +This will not only create a fully formed project containing a POM with the dependency above but also a set of resources, configuration files and a topology class. Enter the directory you just created (should be the same as the artefactId you specified earlier) and follow the instructions on the README file. + +You will of course need to have both Storm and OpenSearch installed. For the latter, the [OpenSearch documentation](https://opensearch.org/docs/latest/install-and-configure/install-opensearch/docker/) contains resources for Docker. + +Unlike in the Elastic module, the schemas are automatically created by the bolts. You can of course override them by using the script 'OS_IndexInit.sh' generated by the archetype, the index definitions are located in _src/main/resources_. + + +Dashboards +--------------------- + +To import the dashboards into a local instance of OpenSearch Dashboard, go into the folder _dashboards_ and run the script _importDashboards.sh_. 
+ +You should see something like + +``` +Importing status dashboard into OpenSearch Dashboards +{"successCount":4,"success":true,"successResults":[{"type":"index-pattern","id":"7445c390-7339-11e9-9289-ffa3ee6775e4","meta":{"title":"status","icon":"indexPatternApp"}},{"type":"visualization","id":"status-count","meta":{"title":"status count","icon":"visualizeApp"}},{"type":"visualization","id":"Top-Hosts","meta":{"title":"Top Hosts","icon":"visualizeApp"}},{"type":"dashboard","id":"Crawl-status","meta":{"title":"Crawl status","icon":"dashboardApp"}}]} +Importing metrics dashboard into OpenSearch Dashboards +{"successCount":9,"success":true,"successResults":[{"type":"index-pattern","id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","meta":{"title":"metrics","icon":"indexPatternApp"}},{"type":"visualization","id":"Fetcher-:-#-active-threads","meta":{"title":"Fetcher : # active threads","icon":"visualizeApp"}},{"type":"visualization","id":"Fetcher-:-num-queues","meta":{"title":"Fetcher : num queues","icon":"visualizeApp"}},{"type":"visualization","id":"Fetcher-:-pages-fetched","meta":{"title":"Fetcher : pages fetched","icon":"visualizeApp"}},{"type":"visualization","id":"Fetcher-:-URLs-waiting-in-queues","meta":{"title":"Fetcher : URLs waiting in queues","icon":"visualizeApp"}},{"type":"visualization","id":"Fetcher-:-average-bytes-per-second","meta":{"title":"Fetcher : average bytes per second","icon":"visualizeApp"}},{"type":"visualization","id":"Fetcher-:-average-pages-per-second","meta":{"title":"Fetcher : average pages per second","icon":"visualizeApp"}},{"type":"visualization","id":"Total-bytes-fetched","meta":{"title":"Total bytes fetched","icon":"visualizeApp"}},{"type":"dashboard","id":"Crawl-metrics","meta":{"title":"Crawl metrics","icon":"dashboardApp"}}]} + +``` + +The [dashboard screen](http://localhost:5601/app/dashboards#/list?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-15m,to:now))) should show both the status and metrics dashboards. 
If you click on `Crawl Status`, you should see 2 tables containing the count of URLs per status and the top hostnames per URL count. +The [Metrics dashboard](http://localhost:5601/app/dashboards#/view/Crawl-metrics) can be used to monitor the progress of the crawl. + +The file _storm.ndjson_ is used to display some of Storm's internal metrics and is not added by default. + +#### Per time period metric indices (optional) + +The _metrics_ index can be configured per time period. This best practice is [discussed on the Elastic website](https://www.elastic.co/guide/en/elasticsearch/guide/current/time-based.html). + +The crawler config YAML must be updated to use an optional argument as shown below to have one index per day: + +``` + #Metrics consumers: + topology.metrics.consumer.register: + - class: "org.apache.stormcrawler.opensearch.metrics.MetricsConsumer" + parallelism.hint: 1 + argument: "yyyy-MM-dd" +``` + + + + + + + + diff --git a/external/opensearch-java/archetype/pom.xml b/external/opensearch-java/archetype/pom.xml new file mode 100644 index 000000000..10b4090de --- /dev/null +++ b/external/opensearch-java/archetype/pom.xml @@ -0,0 +1,72 @@ + + + + + + 4.0.0 + + + org.apache.stormcrawler + stormcrawler + 3.5.2-SNAPSHOT + ../../../pom.xml + + + stormcrawler-opensearch-java-archetype + + maven-archetype + + + + + + src/main/resources + true + + META-INF/maven/archetype-metadata.xml + + + + src/main/resources + false + + META-INF/maven/archetype-metadata.xml + + + + + + + org.apache.maven.archetype + archetype-packaging + 3.4.1 + + + + + + + maven-archetype-plugin + 3.4.1 + + + + + diff --git a/external/opensearch-java/archetype/src/main/resources/META-INF/archetype-post-generate.groovy b/external/opensearch-java/archetype/src/main/resources/META-INF/archetype-post-generate.groovy new file mode 100644 index 000000000..bbdb54974 --- /dev/null +++ b/external/opensearch-java/archetype/src/main/resources/META-INF/archetype-post-generate.groovy @@ -0,0 +1,21 @@ +/* 
+ * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +def file1 = new File(request.getOutputDirectory(), request.getArtifactId() + "/dashboards/importDashboards.sh") +file1.setExecutable(true, false) + +def file2 = new File(request.getOutputDirectory(), request.getArtifactId() + "/OS_IndexInit.sh") +file2.setExecutable(true, false) diff --git a/external/opensearch-java/archetype/src/main/resources/META-INF/maven/archetype-metadata.xml b/external/opensearch-java/archetype/src/main/resources/META-INF/maven/archetype-metadata.xml new file mode 100644 index 000000000..4f58adcd6 --- /dev/null +++ b/external/opensearch-java/archetype/src/main/resources/META-INF/maven/archetype-metadata.xml @@ -0,0 +1,72 @@ + + + + + + + + + ^[a-zA-Z_\-]+$ + + + + + + ^\S+@\S+\.\S+$ + + + ${project.version} + + + + + + src/main/resources + + **/*.xml + **/*.txt + **/*.yaml + **/*.json + **/*.mapping + + + + + + README.md + *.flux + *.yaml + *.sh + + + + dashboards + + *.sh + *.ndjson + + + + + diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/OS_IndexInit.sh b/external/opensearch-java/archetype/src/main/resources/archetype-resources/OS_IndexInit.sh new file mode 100755 index 000000000..69698c1a8 --- /dev/null 
+++ b/external/opensearch-java/archetype/src/main/resources/archetype-resources/OS_IndexInit.sh @@ -0,0 +1,40 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash + +OSHOST=${1:-"http://localhost:9200"} +OSCREDENTIALS=${2:-"-u opensearch:passwordhere"} + +curl $OSCREDENTIALS -s -XDELETE "$OSHOST/status/" > /dev/null +echo "Deleted 'status' index, now recreating it..." +curl $OSCREDENTIALS -s -XPUT "$OSHOST/status" -H 'Content-Type: application/json' --upload-file src/main/resources/status.mapping + +echo "" + +curl $OSCREDENTIALS -s -XDELETE "$OSHOST/content/" > /dev/null +echo "Deleted 'content' index, now recreating it..." +curl $OSCREDENTIALS -s -XPUT "$OSHOST/content" -H 'Content-Type: application/json' --upload-file src/main/resources/indexer.mapping + +### metrics + +curl $OSCREDENTIALS -s -XDELETE "$OSHOST/metrics*/" > /dev/null + +echo "Deleted 'metrics' index, now recreating it..." 
+ +# http://localhost:9200/metrics/_mapping/status?pretty +curl $OSCREDENTIALS -s -XPOST "$OSHOST/_template/metrics-template" -H 'Content-Type: application/json' --upload-file src/main/resources/metrics.mapping + +echo "" diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/README.md b/external/opensearch-java/archetype/src/main/resources/archetype-resources/README.md new file mode 100644 index 000000000..ddd7be949 --- /dev/null +++ b/external/opensearch-java/archetype/src/main/resources/archetype-resources/README.md @@ -0,0 +1,80 @@ +This has been generated by the StormCrawler Maven Archetype as a starting point for building your own crawler with [OpenSearch](https://opensearch.org/) as a backend. +Have a look at the code and resources and modify them to your heart's content. + +# Prerequisites + +## Native +You need to have Apache Storm installed, as well as a running instance of OpenSearch. + +## Docker Compose + +We provide a simple `docker-compose.yaml` file to launch OpenSearch, Zookeeper, Storm Nimbus, Storm Supervisor, and the Storm UI. +You may need to update `opensearch-conf.yaml` to reference the OpenSearch host configuration (Docker container name). + +# Compilation + +First generate an uberjar: + +``` sh +mvn clean package +``` + +# URL injection + +The first step consists in creating a file _seeds.txt_ in the current directory and populating it with the URLs +to be used as a starting point for the crawl, e.g. + +`echo "http://stormcrawler.net/" > seeds.txt` + +You can start the crawl topology in local mode using the URLs in _seeds.txt_ as a starting point with + +``` sh +storm local target/${artifactId}-${version}.jar org.apache.storm.flux.Flux injection.flux --local-ttl 3600 +``` + +Note that in local mode, Flux uses a default TTL for the topology of 20 secs. The command above runs the topology for 1 hour. 
+ +# Running the crawl + +To start crawling, run the following command + +``` sh +storm jar target/${artifactId}-${version}.jar org.apache.storm.flux.Flux crawler.flux +``` + +Note that in the previous command, we ran the topology with `storm jar` to benefit from the Storm UI and logging. In that case, the topology runs continuously, as intended. +If you don't have a Storm cluster set up and/or want to run in local mode, simply replace _jar_ with _local_ and add _--local-ttl 3600_. + + +Index definitions +--------------------- + +Unlike in the Elastic module, the schemas are automatically created by the bolts. You can of course override them by using the script 'OS_IndexInit.sh', the index definitions are located in _src/main/resources_. + + +Dashboards +--------------------- + +To import the dashboards into a local instance of OpenSearch Dashboards, go into the folder _dashboards_ and run the script _importDashboards.sh_. + +You should see something like + +``` +Importing status dashboard into OpenSearch Dashboards +{"successCount":4,"success":true,"successResults":[{"type":"index-pattern","id":"7445c390-7339-11e9-9289-ffa3ee6775e4","meta":{"title":"status","icon":"indexPatternApp"}},{"type":"visualization","id":"status-count","meta":{"title":"status count","icon":"visualizeApp"}},{"type":"visualization","id":"Top-Hosts","meta":{"title":"Top Hosts","icon":"visualizeApp"}},{"type":"dashboard","id":"Crawl-status","meta":{"title":"Crawl status","icon":"dashboardApp"}}]} +Importing metrics dashboard into OpenSearch Dashboards +{"successCount":9,"success":true,"successResults":[{"type":"index-pattern","id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","meta":{"title":"metrics","icon":"indexPatternApp"}},{"type":"visualization","id":"Fetcher-:-#-active-threads","meta":{"title":"Fetcher : # active threads","icon":"visualizeApp"}},{"type":"visualization","id":"Fetcher-:-num-queues","meta":{"title":"Fetcher : num 
queues","icon":"visualizeApp"}},{"type":"visualization","id":"Fetcher-:-pages-fetched","meta":{"title":"Fetcher : pages fetched","icon":"visualizeApp"}},{"type":"visualization","id":"Fetcher-:-URLs-waiting-in-queues","meta":{"title":"Fetcher : URLs waiting in queues","icon":"visualizeApp"}},{"type":"visualization","id":"Fetcher-:-average-bytes-per-second","meta":{"title":"Fetcher : average bytes per second","icon":"visualizeApp"}},{"type":"visualization","id":"Fetcher-:-average-pages-per-second","meta":{"title":"Fetcher : average pages per second","icon":"visualizeApp"}},{"type":"visualization","id":"Total-bytes-fetched","meta":{"title":"Total bytes fetched","icon":"visualizeApp"}},{"type":"dashboard","id":"Crawl-metrics","meta":{"title":"Crawl metrics","icon":"dashboardApp"}}]} + +``` + +The [dashboard screen](http://localhost:5601/app/dashboards#/list?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-15m,to:now))) should show both the status and metrics dashboards. If you click on `Crawl Status`, you should see 2 tables containing the count of URLs per status and the top hostnames per URL count. +The [Metrics dashboard](http://localhost:5601/app/dashboards#/view/Crawl-metrics) can be used to monitor the progress of the crawl. + +The file _storm.ndjson_ is used to display some of Storm's internal metrics and is not added by default. + + + +Happy crawling! If you have any questions, please ask on [StackOverflow with the tag stormcrawler](http://stackoverflow.com/questions/tagged/stormcrawler) or the [discussions](https://github.com/apache/stormcrawler/discussions) section on GitHub. 
+ + + diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/crawler-conf.yaml b/external/opensearch-java/archetype/src/main/resources/archetype-resources/crawler-conf.yaml new file mode 100644 index 000000000..f62103faf --- /dev/null +++ b/external/opensearch-java/archetype/src/main/resources/archetype-resources/crawler-conf.yaml @@ -0,0 +1,160 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Custom configuration for StormCrawler +# This is used to override the default values from crawler-default.xml and provide additional ones +# for your custom components. +# Use this file with the parameter -conf when launching your extension of ConfigurableTopology. +# This file does not contain all the key values but only the most frequently used ones. See crawler-default.xml for an extensive list. 
+ +config: + topology.workers: 1 + topology.message.timeout.secs: 300 + topology.max.spout.pending: 100 + topology.debug: false + + fetcher.threads.number: 50 + + # override the JVM parameters for the workers + topology.worker.childopts: "-Xmx2g -Djava.net.preferIPv4Stack=true" + + # mandatory when using Flux + topology.kryo.register: + - org.apache.stormcrawler.Metadata + - org.apache.stormcrawler.persistence.Status + + # Lists the metadata to transfer to outlinks + # Used by Fetcher and SiteMapParser for redirections, + # discovered links, passing cookies to child pages, etc. + # These are also persisted for the parent document (see below). + # Allows wildcards, eg. "follow.*" transfers all metadata starting with "follow.". + # metadata.transfer: + # - customMetadataName + + # Lists the metadata to persist to storage + # These are not transferred to the outlinks. Also allows wildcards, eg. "follow.*". + metadata.persist: + - _redirTo + - error.cause + - error.source + - isSitemap + - isFeed + + # Agent name info - given here as an example. Do not be an anonynmous coward, use your real information! + # The full user agent value sent as part of the HTTP requests + # is built from the elements below. Only the agent.name is mandatory, + # it is also used to parse the robots.txt directives. + + # The agent name must be compliant with RFC 9309 (section 2.2.1) + # i.e. 
it MUST contain only uppercase and lowercase letters ("a-z" and "A-Z), underscores ("_"), and hyphens ("-") + http.agent.name: "${http-agent-name}" + # version of your crawler + http.agent.version: "${http-agent-version}" + # description of what it does + http.agent.description: "${http-agent-description}" + # URL webmasters can go to to learn about it + http.agent.url: "${http-agent-url}" + # Finally, an email so that they can get in touch with you + http.agent.email: "${http-agent-email}" + + http.protocol.implementation: "org.apache.stormcrawler.protocol.okhttp.HttpProtocol" + https.protocol.implementation: "org.apache.stormcrawler.protocol.okhttp.HttpProtocol" + + # The maximum number of bytes for returned HTTP response bodies. + # The fetched page will be trimmed to 65KB in this case + # Set -1 to disable the limit. + http.content.limit: 65536 + + sitemap.discovery: true + + # FetcherBolt queue dump => comment out to activate + # if a file exists on the worker machine with the corresponding port number + # the FetcherBolt will log the content of its internal queues to the logs + # fetcherbolt.queue.debug.filepath: "/tmp/fetcher-dump-{port}" + + parsefilters.config.file: "parsefilters.json" + urlfilters.config.file: "urlfilters.json" + jsoup.filters.config.file: "jsoupfilters.json" + + # revisit a page daily (value in minutes) + # set it to -1 to never refetch a page + fetchInterval.default: 1440 + + # revisit a page with a fetch error after 2 hours (value in minutes) + # set it to -1 to never refetch a page + fetchInterval.fetch.error: 120 + + # never revisit a page with an error (or set a value in minutes) + fetchInterval.error: -1 + + # set to true if you don't need any text to be extracted by JSoup + textextractor.no.text: false + + # text extraction for JSoupParserBolt + textextractor.include.pattern: + - DIV[id="maincontent"] + - DIV[itemprop="articleBody"] + - ARTICLE + + textextractor.exclude.tags: + - STYLE + - SCRIPT + + # needed for parsing with Tika 
+ jsoup.treat.non.html.as.error: false + + # restricts the documents types to be parsed with Tika + parser.mimetype.whitelist: + - application/.+word.* + - application/.+excel.* + - application/.+powerpoint.* + - application/.*pdf.* + + # Tika parser configuration file + parse.tika.config.file: "tika-config.xml" + + # custom fetch interval to be used when a document has the key/value in its metadata + # and has been fetched successfully (value in minutes) + # fetchInterval.FETCH_ERROR.isFeed=true: 30 + # fetchInterval.isFeed=true: 10 + + # configuration for the classes extending AbstractIndexerBolt + # indexer.md.filter: "someKey=aValue" + indexer.url.fieldname: "url" + indexer.text.fieldname: "content" + indexer.canonical.name: "canonical" + # How to convert metadata key values into fields for indexing + # + # if no alias is specified with =alias, the key value is used + # for instance below, _domain_ and _format_ will be used + # as field names, whereas _title_ will be used for _parse.title_. + # You can specify the index of the value to store from the values array + # by using the _key[index]_ format, e.g. _parse.title[0]_ would try to + # get the first value for the metadata _parse.title_ (which is the default anyway). + # Finally, you can use a glob (*) to match all the keys, e.g. _parse.*_ would + # index all the keys with _parse_ as a prefix. Note that in that case, you can't + # specify an alias with =, nor can you specify an index. 
+ indexer.md.mapping: + - parse.title=title + - parse.keywords=keywords + - parse.description=description + - domain + - format + + # Metrics consumers: + topology.metrics.consumer.register: + - class: "org.apache.storm.metric.LoggingMetricsConsumer" + parallelism.hint: 1 + diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/crawler.flux b/external/opensearch-java/archetype/src/main/resources/archetype-resources/crawler.flux new file mode 100644 index 000000000..85fb6c655 --- /dev/null +++ b/external/opensearch-java/archetype/src/main/resources/archetype-resources/crawler.flux @@ -0,0 +1,141 @@ +name: "crawler" + +includes: + - resource: true + file: "/crawler-default.yaml" + override: false + + - resource: false + file: "crawler-conf.yaml" + override: true + + - resource: false + file: "opensearch-conf.yaml" + override: true + +spouts: + - id: "spout" + className: "org.apache.stormcrawler.opensearch.persistence.AggregationSpout" + parallelism: 10 + +bolts: + - id: "partitioner" + className: "org.apache.stormcrawler.bolt.URLPartitionerBolt" + parallelism: 1 + - id: "fetcher" + className: "org.apache.stormcrawler.bolt.FetcherBolt" + parallelism: 1 + - id: "sitemap" + className: "org.apache.stormcrawler.bolt.SiteMapParserBolt" + parallelism: 1 + - id: "parse" + className: "org.apache.stormcrawler.bolt.JSoupParserBolt" + parallelism: 1 + - id: "shunt" + className: "org.apache.stormcrawler.tika.RedirectionBolt" + parallelism: 1 + - id: "tika" + className: "org.apache.stormcrawler.tika.ParserBolt" + parallelism: 1 + - id: "index" + className: "org.apache.stormcrawler.opensearch.bolt.IndexerBolt" + parallelism: 1 + - id: "status" + className: "org.apache.stormcrawler.opensearch.persistence.StatusUpdaterBolt" + parallelism: 1 + - id: "deleter" + className: "org.apache.stormcrawler.opensearch.bolt.DeletionBolt" + parallelism: 1 + - id: "status_metrics" + className: "org.apache.stormcrawler.opensearch.metrics.StatusMetricsBolt" + 
parallelism: 1 + +streams: + - from: "spout" + to: "partitioner" + grouping: + type: SHUFFLE + + - from: "__system" + to: "status_metrics" + grouping: + type: SHUFFLE + streamId: "__tick" + + - from: "partitioner" + to: "fetcher" + grouping: + type: FIELDS + args: ["key"] + + - from: "fetcher" + to: "sitemap" + grouping: + type: LOCAL_OR_SHUFFLE + + - from: "sitemap" + to: "parse" + grouping: + type: LOCAL_OR_SHUFFLE + + - from: "parse" + to: "shunt" + grouping: + type: LOCAL_OR_SHUFFLE + + - from: "shunt" + to: "tika" + grouping: + type: LOCAL_OR_SHUFFLE + streamId: "tika" + + - from: "tika" + to: "index" + grouping: + type: LOCAL_OR_SHUFFLE + + - from: "shunt" + to: "index" + grouping: + type: LOCAL_OR_SHUFFLE + + - from: "fetcher" + to: "status" + grouping: + type: FIELDS + args: ["url"] + streamId: "status" + + - from: "sitemap" + to: "status" + grouping: + type: FIELDS + args: ["url"] + streamId: "status" + + - from: "parse" + to: "status" + grouping: + type: FIELDS + args: ["url"] + streamId: "status" + + - from: "tika" + to: "status" + grouping: + type: FIELDS + args: ["url"] + streamId: "status" + + - from: "index" + to: "status" + grouping: + type: FIELDS + args: ["url"] + streamId: "status" + + - from: "status" + to: "deleter" + grouping: + type: LOCAL_OR_SHUFFLE + streamId: "deletion" diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/importDashboards.sh b/external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/importDashboards.sh new file mode 100755 index 000000000..561f739c1 --- /dev/null +++ b/external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/importDashboards.sh @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/sh + +BIN=$(dirname $0) + +echo "Importing status dashboard into OpenSearch Dashboards" +curl -X POST "localhost:5601/api/saved_objects/_import" -H "osd-xsrf: true" --form file=@$BIN/status.ndjson +echo "" + +echo "Importing metrics dashboard into OpenSearch Dashboards" +curl -X POST "localhost:5601/api/saved_objects/_import" -H "osd-xsrf: true" --form file=@$BIN/metrics.ndjson +echo "" + +# Storm internal metrics +# curl -X POST "localhost:5601/api/saved_objects/_import" -H "osd-xsrf: true" --form file=@$BIN/storm.ndjson diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/metrics.ndjson b/external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/metrics.ndjson new file mode 100644 index 000000000..20cbb2bc0 --- /dev/null +++ b/external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/metrics.ndjson @@ -0,0 +1,10 @@
+{"attributes":{"fields":"[{\"name\":\"_id\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_index\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_score\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_source\",\"type\":\"_source\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_type\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"name\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcComponentId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcTaskId\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerHost\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerPort\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"stormId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"timestamp\",\"type\":\"date\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"value\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true}]","timeFieldName":"timestamp","title":"metrics"},"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","mi
grationVersion":{"index-pattern":"6.5.0"},"references":[],"type":"index-pattern","updated_at":"2020-01-06T11:06:58.178Z","version":"WzMsMV0="} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:activethreads\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : # active threads","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"3\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"2\"}}],\"listeners\":{},\"title\":\"Fetcher : # active threads\"}"},"id":"Fetcher-:-#-active-threads","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.178Z","version":"WzksMV0="} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:num_queues\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : num 
queues","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{},\"spyPerPage\":10},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"3\"}}],\"listeners\":{},\"title\":\"Fetcher : num queues\"}"},"id":"Fetcher-:-num-queues","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.175Z","version":"WzgsMV0="} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_counter.fetched\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : pages 
fetched","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{},\"spyPerPage\":10},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"sum\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"3\"}}],\"listeners\":{},\"title\":\"Fetcher : pages fetched\"}"},"id":"Fetcher-:-pages-fetched","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.170Z","version":"WzcsMV0="} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:in_queues\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : URLs waiting in 
queues","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"addLegend\":false,\"addTimeMarker\":false,\"addTooltip\":true,\"defaultYExtents\":false,\"mode\":\"grouped\",\"scale\":\"linear\",\"setYExtents\":false,\"shareYAxis\":true,\"spyPerPage\":10,\"times\":[],\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"4\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"5\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"4\"}}],\"listeners\":{},\"title\":\"Fetcher : URLs waiting in queues\"}"},"id":"Fetcher-:-URLs-waiting-in-queues","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.160Z","version":"WzUsMV0="} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_average_persec.bytes_fetched_perSec\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : average bytes per 
second","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"line\",\"params\":{\"shareYAxis\":true,\"addTooltip\":false,\"addLegend\":false,\"showCircles\":true,\"smoothLines\":true,\"interpolate\":\"linear\",\"scale\":\"linear\",\"drawLinesBetweenPoints\":true,\"radiusRatio\":9,\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}}],\"listeners\":{},\"title\":\"Fetcher : average bytes per second\"}"},"id":"Fetcher-:-average-bytes-per-second","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.173Z","version":"WzYsMV0="} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_average_persec.fetched_perSec\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : average pages per 
second","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"line\",\"params\":{\"shareYAxis\":true,\"addTooltip\":false,\"addLegend\":false,\"showCircles\":true,\"smoothLines\":true,\"interpolate\":\"linear\",\"scale\":\"linear\",\"drawLinesBetweenPoints\":true,\"radiusRatio\":9,\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"3\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}}],\"listeners\":{},\"title\":\"Fetcher : average pages per second\"}"},"id":"Fetcher-:-average-pages-per-second","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.820Z","version":"WzEwLDFd"} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_counter.bytes_fetched\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Total bytes 
fetched","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"line\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"showCircles\":true,\"smoothLines\":true,\"interpolate\":\"linear\",\"scale\":\"linear\",\"drawLinesBetweenPoints\":true,\"radiusRatio\":9,\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"sum\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"m\",\"min_doc_count\":1,\"extended_bounds\":{}}}],\"listeners\":{},\"title\":\"Total bytes fetched\"}"},"id":"Total-bytes-fetched","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.823Z","version":"WzExLDFd"} +{"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"filter\":[],\"query\":{\"query\":{\"query_string\":{\"analyze_wildcard\":true,\"query\":\"*\"}},\"language\":\"lucene\"}}"},"optionsJSON":"{\"useMargins\":true,\"hidePanelTitles\":false}","panelsJSON":"[{\"panelIndex\":\"1\",\"gridData\":{\"x\":24,\"y\":20,\"w\":12,\"h\":12,\"i\":\"1\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_0\"},{\"panelIndex\":\"2\",\"gridData\":{\"x\":12,\"y\":20,\"w\":12,\"h\":12,\"i\":\"2\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_1\"},{\"panelIndex\":\"3\",\"gridData\":{\"x\":0,\"y\":0,\"w\":36,\"h\":12,\"i\":\"3\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_2\"},{\"panelIndex\":\"4\",\"gridData\":{\"x\":0,\"y\":20,\"w\":12,\"h\":12,\"i\":\"4\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_3\"},{\"panelIndex\":\"5\",\"gridData\":{\"x\":0,\"y\":40,\"w\":36,\"h\":8,\"i\":\"5\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_4\"},{\"panelIndex\":\"6\",\"gridData\
":{\"x\":0,\"y\":32,\"w\":36,\"h\":8,\"i\":\"6\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_5\"},{\"panelIndex\":\"7\",\"gridData\":{\"x\":0,\"y\":12,\"w\":36,\"h\":8,\"i\":\"7\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_6\"}]","timeRestore":false,"title":"Crawl metrics","version":1},"id":"Crawl-metrics","migrationVersion":{"dashboard":"7.3.0"},"references":[{"id":"Fetcher-:-#-active-threads","name":"panel_0","type":"visualization"},{"id":"Fetcher-:-num-queues","name":"panel_1","type":"visualization"},{"id":"Fetcher-:-pages-fetched","name":"panel_2","type":"visualization"},{"id":"Fetcher-:-URLs-waiting-in-queues","name":"panel_3","type":"visualization"},{"id":"Fetcher-:-average-bytes-per-second","name":"panel_4","type":"visualization"},{"id":"Fetcher-:-average-pages-per-second","name":"panel_5","type":"visualization"},{"id":"Total-bytes-fetched","name":"panel_6","type":"visualization"}],"type":"dashboard","updated_at":"2020-01-06T11:06:58.830Z","version":"WzQsMV0="} +{"exportedCount":9,"missingRefCount":0,"missingReferences":[]} diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/status.ndjson b/external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/status.ndjson new file mode 100644 index 000000000..b3d0122e4 --- /dev/null +++ b/external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/status.ndjson @@ -0,0 +1,5 @@ 
+{"attributes":{"fields":"[{\"name\":\"_id\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_index\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_score\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_source\",\"type\":\"_source\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_type\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"key\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"metadata._redirTo\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.depth\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.error%2Ecause\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.error%2Esource\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.fetch%2Eerror%2Ecount\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.isFeed\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.isSitemap\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"na
me\":\"metadata.url%2Epath\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"nextFetchDate\",\"type\":\"date\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"status\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"url\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":true,\"readFromDocValues\":true}]","title":"status"},"id":"7445c390-7339-11e9-9289-ffa3ee6775e4","migrationVersion":{"index-pattern":"6.5.0"},"references":[],"type":"index-pattern","updated_at":"2020-01-06T11:07:47.130Z","version":"WzEzLDFd"} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"*\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"status count","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"table\",\"params\":{\"perPage\":10,\"showPartialRows\":false,\"showMeticsAtAllLevels\":false},\"aggs\":[{\"id\":\"1\",\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"status\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{},\"title\":\"status count\"}"},"id":"status-count","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"7445c390-7339-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:07:48.278Z","version":"WzE1LDFd"} 
+{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"*\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Top Hosts","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"table\",\"params\":{\"perPage\":10,\"showPartialRows\":false,\"showMeticsAtAllLevels\":false},\"aggs\":[{\"id\":\"1\",\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"key\",\"size\":50,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{},\"title\":\"Top Hosts\"}"},"id":"Top-Hosts","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"7445c390-7339-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:07:48.281Z","version":"WzE2LDFd"} +{"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"filter\":[],\"query\":{\"query\":\"*\",\"language\":\"lucene\"}}"},"optionsJSON":"{\"useMargins\":true,\"hidePanelTitles\":false}","panelsJSON":"[{\"panelRefName\":\"panel_0\",\"version\":\"7.3.0\",\"panelIndex\":\"37874bbf-6607-435a-a231-94d81e9193e7\",\"gridData\":{\"x\":0,\"y\":0,\"w\":16,\"h\":20,\"i\":\"37874bbf-6607-435a-a231-94d81e9193e7\"},\"embeddableConfig\":{}},{\"panelRefName\":\"panel_1\",\"version\":\"7.3.0\",\"panelIndex\":\"4faa5b74-1660-44f7-9227-89d900c8231e\",\"gridData\":{\"x\":16,\"y\":0,\"w\":16,\"h\":20,\"i\":\"4faa5b74-1660-44f7-9227-89d900c8231e\"},\"embeddableConfig\":{}}]","timeRestore":false,"title":"Crawl 
status","version":1},"id":"Crawl-status","migrationVersion":{"dashboard":"7.3.0"},"references":[{"id":"status-count","name":"panel_0","type":"visualization"},{"id":"Top-Hosts","name":"panel_1","type":"visualization"}],"type":"dashboard","updated_at":"2020-01-06T11:07:47.948Z","version":"WzE0LDFd"} +{"exportedCount":4,"missingRefCount":0,"missingReferences":[]} diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/storm.ndjson b/external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/storm.ndjson new file mode 100644 index 000000000..1d25d1f6e --- /dev/null +++ b/external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/storm.ndjson @@ -0,0 +1,5 @@ +{"attributes":{"fields":"[{\"name\":\"_id\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_index\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_score\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_source\",\"type\":\"_source\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_type\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"name\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcComponentId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcTaskId\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerHost\",\"type\":\"string\",\"count\":0,\"scr
ipted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerPort\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"stormId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"timestamp\",\"type\":\"date\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"value\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true}]","timeFieldName":"timestamp","title":"metrics"},"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","migrationVersion":{"index-pattern":"6.5.0"},"references":[],"type":"index-pattern","updated_at":"2020-01-06T11:06:58.178Z","version":"WzMsMV0="} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name: \\\"__receive.population\\\"\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Storm Receive 
Queues","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"table\",\"params\":{\"perPage\":10,\"showPartialRows\":false,\"showMeticsAtAllLevels\":false},\"aggs\":[{\"id\":\"1\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"srcTaskId\",\"size\":10,\"order\":\"desc\",\"orderBy\":\"1\"}},{\"id\":\"3\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"srcComponentId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{}}"},"id":"Storm-Receive-Queues","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:09:12.875Z","version":"WzIwLDFd"} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"+srcComponentId: \\\"__system\\\" +name: memory\\\\/heap*\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Memory 
Heap","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":true,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":true,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"name\",\"size\":10,\"order\":\"desc\",\"orderBy\":\"1\"}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"split\",\"params\":{\"field\":\"srcWorkerHost\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\",\"row\":true}}],\"listeners\":{}}"},"id":"Memory-Heap","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:09:12.877Z","version":"WzIxLDFd"} +{"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"filter\":[],\"query\":{\"query\":\"*\",\"language\":\"lucene\"}}"},"optionsJSON":"{\"useMargins\":true,\"hidePanelTitles\":false}","panelsJSON":"[{\"panelRefName\":\"panel_0\",\"version\":\"7.3.0\",\"panelIndex\":\"19123ee9-8f49-4621-a4dc-716b5ff9fcaf\",\"gridData\":{\"x\":0,\"y\":0,\"w\":32,\"h\":8,\"i\":\"19123ee9-8f49-4621-a4dc-716b5ff9fcaf\"},\"embeddableConfig\":{}},{\"panelRefName\":\"panel_1\",\"version\":\"7.3.0\",\"panelIndex\":\"5fd83542-b7e6-48e0-8679-2ffcacf453a3\",\"gridData\":{\"x\":0,\"y\":8,\"w\":32,\"h\":16,\"i\":\"5fd83542-b7e6-48e0-8679-2ffcacf453a3\"},\"embeddableConfig\":{}}]","timeRestore":false,"title":"Storm 
metrics","version":1},"id":"Storm-metrics","migrationVersion":{"dashboard":"7.3.0"},"references":[{"id":"Storm-Receive-Queues","name":"panel_0","type":"visualization"},{"id":"Memory-Heap","name":"panel_1","type":"visualization"}],"type":"dashboard","updated_at":"2020-01-06T11:08:33.810Z","version":"WzE5LDFd"} +{"exportedCount":4,"missingRefCount":0,"missingReferences":[]} diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/docker-compose.yml b/external/opensearch-java/archetype/src/main/resources/archetype-resources/docker-compose.yml new file mode 100644 index 000000000..ccad3cc41 --- /dev/null +++ b/external/opensearch-java/archetype/src/main/resources/archetype-resources/docker-compose.yml @@ -0,0 +1,81 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+services: + zookeeper: + image: zookeeper:3.9.3 + container_name: zookeeper + restart: always + + nimbus: + image: storm:latest + container_name: nimbus + hostname: nimbus + command: storm nimbus + depends_on: + - zookeeper + restart: always + + supervisor: + image: storm:latest + container_name: supervisor + command: storm supervisor -c worker.childopts=-Xmx%HEAP-MEM%m + depends_on: + - nimbus + - zookeeper + restart: always + + ui: + image: storm:latest + container_name: ui + command: storm ui + depends_on: + - nimbus + restart: always + ports: + - "127.0.0.1:8080:8080" + + opensearch-sc: + image: opensearchproject/opensearch:2.19.4 + container_name: opensearch-sc + environment: + - cluster.name=opensearch-sc-cluster + - node.name=opensearch-sc + - discovery.type=single-node + - bootstrap.memory_lock=true # along with the memlock settings below, disables swapping + - "OPENSEARCH_JAVA_OPTS=-Xms4G -Xmx4G" + - plugins.security.disabled=true + - "DISABLE_INSTALL_DEMO_CONFIG=true" + volumes: + - opensearch-sc-data:/usr/share/opensearch/data + ulimits: + memlock: + soft: -1 + hard: -1 + nofile: + soft: 65536 # maximum number of open files for the OpenSearch user, set to at least 65536 on modern systems + hard: 65536 + ports: + - "127.0.0.1:9200:9200" # REST API + + opensearch-dashboard: + image: opensearchproject/opensearch-dashboards:2.19.4 + container_name: dashboard + ports: + - "127.0.0.1:5601:5601" + expose: + - "5601" + environment: + - 'OPENSEARCH_HOSTS=["http://opensearch-sc:9200"]' + - "DISABLE_SECURITY_DASHBOARDS_PLUGIN=true" # disables security dashboards plugin in OpenSearch Dashboards diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/injection.flux b/external/opensearch-java/archetype/src/main/resources/archetype-resources/injection.flux new file mode 100644 index 000000000..060c1052f --- /dev/null +++ b/external/opensearch-java/archetype/src/main/resources/archetype-resources/injection.flux @@ -0,0 +1,50 @@ +name: 
"injection" + +includes: + - resource: true + file: "/crawler-default.yaml" + override: false + + - resource: false + file: "crawler-conf.yaml" + override: true + + - resource: false + file: "opensearch-conf.yaml" + override: true + +spouts: + - id: "filespout" + className: "org.apache.stormcrawler.spout.FileSpout" + parallelism: 1 + constructorArgs: + - "." + - "seeds.txt" + - true + +bolts: + - id: "filter" + className: "org.apache.stormcrawler.bolt.URLFilterBolt" + parallelism: 1 + + - id: "status" + className: "org.apache.stormcrawler.opensearch.persistence.StatusUpdaterBolt" + parallelism: 1 + +streams: + - from: "filespout" + to: "filter" + grouping: + type: FIELDS + args: ["url"] + streamId: "status" + + - from: "filter" + to: "status" + grouping: + streamId: "status" + type: CUSTOM + customClass: + className: "org.apache.stormcrawler.util.URLStreamGrouping" + constructorArgs: + - "byDomain" diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/opensearch-conf.yaml b/external/opensearch-java/archetype/src/main/resources/archetype-resources/opensearch-conf.yaml new file mode 100644 index 000000000..25d6e4dba --- /dev/null +++ b/external/opensearch-java/archetype/src/main/resources/archetype-resources/opensearch-conf.yaml @@ -0,0 +1,115 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# configuration for OpenSearch resources + +config: + + # address to use unless a more specific one has been + # defined for a component + # also accepts a list or multiple values in a single line + # separated by a semi-colon e.g. "opensearch1:9200; opensearch2:9200" + opensearch.addresses: "http://localhost:9200" + #opensearch.user: "USERNAME" + #opensearch.password: "PASSWORD" + opensearch.concurrentRequests: 2 + + # Disable TLS validation for connection to OpenSearch + # opensearch.disable.tls.validation: false + + # Indexer bolt + # addresses can be specified as a full URL + # if not we assume that the protocol is http and the port 9200 + opensearch.indexer.addresses: "localhost" + opensearch.indexer.index.name: "content" + # opensearch.indexer.pipeline: "_PIPELINE_" + opensearch.indexer.create: false + opensearch.indexer.bulkActions: 100 + opensearch.indexer.flushInterval: "2s" + opensearch.indexer.concurrentRequests: 1 + opensearch.indexer.sniff: true + + # MetricsConsumer + # opensearch.metrics.addresses: "http://localhost:9200" + opensearch.metrics.index.name: "metrics" + opensearch.metrics.sniff: true + + # Spout and persistence bolt + opensearch.status.addresses: "http://localhost:9200" + opensearch.status.index.name: "status" + #opensearch.status.user: "USERNAME" + #opensearch.status.password: "PASSWORD" + # the routing is done on the value of 'partition.url.mode' + opensearch.status.routing: true + # stores the value used for grouping the URLs as a separate field + # needed by the spout implementations + # also used for routing if the value above is set to true + opensearch.status.routing.fieldname: "key" + opensearch.status.bulkActions: 500 + opensearch.status.flushInterval: "5s" + opensearch.status.concurrentRequests: 1 + opensearch.status.sniff: true + + # spout config # + + # positive or negative filters parsable by the Lucene Query Parser + 
# opensearch.status.filterQuery: + # - "-(key:stormcrawler.net)" + # - "-(key:stormcrawler.apache.org)" + + # time in secs for which the URLs will be considered for fetching after an ack or a fail + spout.ttl.purgatory: 30 + + # Min time (in msecs) to allow between 2 successive queries to OpenSearch + spout.min.delay.queries: 2000 + + # Max time (in msecs) to allow between 2 successive queries to OpenSearch + spout.max.delay.queries: 20000 + + # Delay since previous query date (in secs) after which the nextFetchDate value will be reset to the current time + # Setting this to -1 or a large value means that OpenSearch will cache the results but also that fewer and fewer + # results might be returned. + spout.reset.fetchdate.after: 120 + + opensearch.status.max.buckets: 50 + opensearch.status.max.urls.per.bucket: 2 + # field to group the URLs into buckets + opensearch.status.bucket.field: "key" + # fields to sort the URLs within a bucket + opensearch.status.bucket.sort.field: + - "nextFetchDate" + - "url" + # field to sort the buckets + opensearch.status.global.sort.field: "nextFetchDate" + + # AggregationSpout : sampling improves the performance on large crawls + opensearch.status.sample: false + + # max allowed duration of a query in sec + opensearch.status.query.timeout: -1 + + # AggregationSpout (expert): adds this value in mins to the latest date returned in the results and + # use it as nextFetchDate + opensearch.status.recentDate.increase: -1 + opensearch.status.recentDate.min.gap: -1 + + topology.metrics.consumer.register: + - class: "org.apache.stormcrawler.opensearch.metrics.MetricsConsumer" + parallelism.hint: 1 + #whitelist: + # - "fetcher_counter" + # - "fetcher_average.bytes_fetched" + #blacklist: + # - "__receive.*" diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/pom.xml b/external/opensearch-java/archetype/src/main/resources/archetype-resources/pom.xml new file mode 100644 index 000000000..cdfb7204f --- /dev/null +++ 
b/external/opensearch-java/archetype/src/main/resources/archetype-resources/pom.xml @@ -0,0 +1,149 @@ + + + + + + + 4.0.0 + ${groupId} + ${artifactId} + ${version} + jar + + ${artifactId} + + + UTF-8 + ${StormCrawlerVersion} + 2.8.5 + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.11.0 + + 17 + 17 + + + + org.codehaus.mojo + exec-maven-plugin + 3.1.0 + + + + exec + + + + + java + true + false + compile + + + + org.apache.maven.plugins + maven-shade-plugin + 3.5.0 + + + package + + shade + + + false + + + + org.apache.storm.flux.Flux + + + + + + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + org.apache.storm:flux-core + + org/apache/commons/** + org/apache/http/** + org/yaml/** + + + + + + + + + + + + + org.apache.stormcrawler + stormcrawler-core + ${stormcrawler.version} + + + org.apache.stormcrawler + stormcrawler-opensearch-java + ${stormcrawler.version} + + + org.apache.storm + storm-client + ${storm.version} + provided + + + org.apache.storm + flux-core + ${storm.version} + + + org.apache.stormcrawler + stormcrawler-tika + ${stormcrawler.version} + + + diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-filters.txt b/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-filters.txt new file mode 100644 index 000000000..389ef587b --- /dev/null +++ b/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-filters.txt @@ -0,0 +1,32 @@ +# skip file: ftp: and mailto: urls +-^(file|ftp|mailto): + +# skip image and other suffixes we can't parse or are not likely to be relevant +# if you want to crawl images or videos or archives then you should comment out this line 
+-(?i)\.(apk|deb|cab|iso|gif|jpg|png|svg|ico|css|sit|eps|wmf|rar|tar|jar|zip|gz|bz2|rpm|tgz|mov|exe|jpeg|jpe|bmp|js|mpg|mp3|mp4|m4a|ogv|kml|wmv|swf|flv|mkv|m4v|webm|ra|wma|wav|avi|xspf|m3u)(\?|&|$) + +# skip URLs with slash-delimited segment that repeats 3+ times, to break loops +# very time-consuming : use BasicURLFilter instead +# -.*(/[^/]+)/[^/]+\1/[^/]+\1/ + +# exclude localhost and equivalents to avoid that information +# can be leaked by placing faked links pointing to web interfaces +# of services running on the crawling machine (e.g., Elasticsearch, +# Storm) +# +# - exclude localhost and loop-back addresses +# http://localhost:8080 +# http://127.0.0.1/ .. http://127.255.255.255/ +# http://[::1]/ +-^https?://(?:localhost|127(?:\.(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))){3}|\[::1\])(?::\d+)?(?:/|$) +# +# - exclude private IP address spaces +# 10.0.0.0/8 +-^https?://(?:10(?:\.(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))){3})(?::\d+)?(?:/|$) +# 192.168.0.0/16 +-^https?://(?:192\.168(?:\.(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))){2})(?::\d+)?(?:/|$) +# 172.16.0.0/12 +-^https?://(?:172\.(?:1[6789]|2[0-9]|3[01])(?:\.(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))){2})(?::\d+)?(?:/|$) + +# accept anything else ++. 
diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-normalizers.xml b/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-normalizers.xml new file mode 100644 index 000000000..accea7b5c --- /dev/null +++ b/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-normalizers.xml @@ -0,0 +1,78 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/indexer.mapping b/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/indexer.mapping new file mode 100644 index 000000000..fc6eb887f --- /dev/null +++ b/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/indexer.mapping @@ -0,0 +1,40 @@ +{ + "settings": { + "index": { + "number_of_shards": 5, + "number_of_replicas": 1, + "refresh_interval": "60s" + } + }, + "mappings": { + "_source": { + "enabled": true + }, + "properties": { + "content": { + "type": "text" + }, + "description": { + "type": "text" + }, + "domain": { + "type": "keyword" + }, + "format": { + "type": "keyword" + }, + "keywords": { + "type": "keyword" + }, + "host": { + "type": "keyword" + }, + "title": { + "type": "text" + }, + "url": { + "type": "keyword" + } + } + } +} diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/jsoupfilters.json b/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/jsoupfilters.json new file mode 100644 index 000000000..4d87d8d5a --- /dev/null +++ b/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/jsoupfilters.json @@ -0,0 +1,27 @@ +{ + "org.apache.stormcrawler.parse.JSoupFilters": [ + 
{ + "class": "org.apache.stormcrawler.jsoup.XPathFilter", + "name": "XPathFilter", + "params": { + "canonical": "//*[@rel=\"canonical\"]/@href", + "parse.description": [ + "//*[@name=\"description\"]/@content", + "//*[@name=\"Description\"]/@content" + ], + "parse.title": [ + "//TITLE/allText()", + "//META[@name=\"title\"]/@content" + ], + "parse.keywords": "//META[@name=\"keywords\"]/@content" + } + }, + { + "class": "org.apache.stormcrawler.jsoup.LinkParseFilter", + "name": "LinkParseFilter", + "params": { + "pattern": "//FRAME/@src" + } + } + ] +} diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/metrics.mapping b/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/metrics.mapping new file mode 100644 index 000000000..fc6ae3a09 --- /dev/null +++ b/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/metrics.mapping @@ -0,0 +1,40 @@ +{ + "index_patterns": "metrics*", + "settings": { + "index": { + "number_of_shards": 1, + "refresh_interval": "30s" + }, + "number_of_replicas": 0 + }, + "mappings": { + "_source": { "enabled": true }, + "properties": { + "name": { + "type": "keyword" + }, + "stormId": { + "type": "keyword" + }, + "srcComponentId": { + "type": "keyword" + }, + "srcTaskId": { + "type": "short" + }, + "srcWorkerHost": { + "type": "keyword" + }, + "srcWorkerPort": { + "type": "integer" + }, + "timestamp": { + "type": "date", + "format": "date_optional_time" + }, + "value": { + "type": "double" + } + } + } +} diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/parsefilters.json b/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/parsefilters.json new file mode 100644 index 000000000..5d525830d --- /dev/null +++ b/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/parsefilters.json @@ 
-0,0 +1,23 @@ +{ + "org.apache.stormcrawler.parse.ParseFilters": [ + { + "class": "org.apache.stormcrawler.parse.filter.DomainParseFilter", + "name": "DomainParseFilter", + "params": { + "key": "domain", + "byHost": false + } + }, + { + "class": "org.apache.stormcrawler.parse.filter.MimeTypeNormalization", + "name": "MimeTypeNormalization" + }, + { + "class": "org.apache.stormcrawler.parse.filter.CommaSeparatedToMultivaluedMetadata", + "name": "CommaSeparatedToMultivaluedMetadata", + "params": { + "keys": ["parse.keywords"] + } + } + ] +} diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/status.mapping b/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/status.mapping new file mode 100644 index 000000000..e5b14fe97 --- /dev/null +++ b/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/status.mapping @@ -0,0 +1,39 @@ +{ + "settings": { + "index": { + "number_of_shards": 10, + "number_of_replicas": 1, + "refresh_interval": "5s" + } + }, + "mappings": { + "dynamic_templates": [{ + "metadata": { + "path_match": "metadata.*", + "match_mapping_type": "string", + "mapping": { + "type": "keyword" + } + } + }], + "_source": { + "enabled": true + }, + "properties": { + "key": { + "type": "keyword", + "index": true + }, + "nextFetchDate": { + "type": "date", + "format": "date_optional_time" + }, + "status": { + "type": "keyword" + }, + "url": { + "type": "keyword" + } + } + } +} diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/urlfilters.json b/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/urlfilters.json new file mode 100644 index 000000000..6098631bb --- /dev/null +++ b/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/urlfilters.json @@ -0,0 +1,60 @@ +{ + 
"org.apache.stormcrawler.filtering.URLFilters": [ + { + "class": "org.apache.stormcrawler.filtering.basic.BasicURLFilter", + "name": "BasicURLFilter", + "params": { + "maxPathRepetition": 3, + "maxLength": 1024 + } + }, + { + "class": "org.apache.stormcrawler.filtering.depth.MaxDepthFilter", + "name": "MaxDepthFilter", + "params": { + "maxDepth": -1 + } + }, + { + "class": "org.apache.stormcrawler.filtering.basic.BasicURLNormalizer", + "name": "BasicURLNormalizer", + "params": { + "removeAnchorPart": true, + "unmangleQueryString": true, + "checkValidURI": true, + "removeHashes": true, + "hostIDNtoASCII": true + } + }, + { + "class": "org.apache.stormcrawler.filtering.host.HostURLFilter", + "name": "HostURLFilter", + "params": { + "ignoreOutsideHost": false, + "ignoreOutsideDomain": true + } + }, + { + "class": "org.apache.stormcrawler.filtering.regex.RegexURLNormalizer", + "name": "RegexURLNormalizer", + "params": { + "regexNormalizerFile": "default-regex-normalizers.xml" + } + }, + { + "class": "org.apache.stormcrawler.filtering.regex.RegexURLFilter", + "name": "RegexURLFilter", + "params": { + "regexFilterFile": "default-regex-filters.txt" + } + }, + { + "class": "org.apache.stormcrawler.filtering.basic.SelfURLFilter", + "name": "SelfURLFilter" + }, + { + "class": "org.apache.stormcrawler.filtering.sitemap.SitemapFilter", + "name": "SitemapFilter" + } + ] +} diff --git a/external/opensearch-java/dashboards/importDashboards.sh b/external/opensearch-java/dashboards/importDashboards.sh new file mode 100755 index 000000000..561f739c1 --- /dev/null +++ b/external/opensearch-java/dashboards/importDashboards.sh @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/sh + +BIN=$(dirname $0) + +echo "Importing status dashboard into OpenSearch Dashboards" +curl -X POST "localhost:5601/api/saved_objects/_import" -H "osd-xsrf: true" --form file=@$BIN/status.ndjson +echo "" + +echo "Importing metrics dashboard into OpenSearch Dashboards" +curl -X POST "localhost:5601/api/saved_objects/_import" -H "osd-xsrf: true" --form file=@$BIN/metrics.ndjson +echo "" + +# Storm internal metrics +# curl -X POST "localhost:5601/api/saved_objects/_import" -H "osd-xsrf: true" --form file=@$BIN/storm.ndjson diff --git a/external/opensearch-java/dashboards/metrics.ndjson b/external/opensearch-java/dashboards/metrics.ndjson new file mode 100644 index 000000000..20cbb2bc0 --- /dev/null +++ b/external/opensearch-java/dashboards/metrics.ndjson @@ -0,0 +1,10 @@ 
+{"attributes":{"fields":"[{\"name\":\"_id\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_index\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_score\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_source\",\"type\":\"_source\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_type\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"name\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcComponentId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcTaskId\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerHost\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerPort\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"stormId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"timestamp\",\"type\":\"date\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"value\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true}]","timeFieldName":"timestamp","title":"metrics"},"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","mi
grationVersion":{"index-pattern":"6.5.0"},"references":[],"type":"index-pattern","updated_at":"2020-01-06T11:06:58.178Z","version":"WzMsMV0="} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:activethreads\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : # active threads","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"3\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"2\"}}],\"listeners\":{},\"title\":\"Fetcher : # active threads\"}"},"id":"Fetcher-:-#-active-threads","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.178Z","version":"WzksMV0="} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:num_queues\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : num 
queues","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{},\"spyPerPage\":10},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"3\"}}],\"listeners\":{},\"title\":\"Fetcher : num queues\"}"},"id":"Fetcher-:-num-queues","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.175Z","version":"WzgsMV0="} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_counter.fetched\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : pages 
fetched","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{},\"spyPerPage\":10},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"sum\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"3\"}}],\"listeners\":{},\"title\":\"Fetcher : pages fetched\"}"},"id":"Fetcher-:-pages-fetched","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.170Z","version":"WzcsMV0="} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:in_queues\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : URLs waiting in 
queues","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"addLegend\":false,\"addTimeMarker\":false,\"addTooltip\":true,\"defaultYExtents\":false,\"mode\":\"grouped\",\"scale\":\"linear\",\"setYExtents\":false,\"shareYAxis\":true,\"spyPerPage\":10,\"times\":[],\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"4\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"5\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"4\"}}],\"listeners\":{},\"title\":\"Fetcher : URLs waiting in queues\"}"},"id":"Fetcher-:-URLs-waiting-in-queues","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.160Z","version":"WzUsMV0="} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_average_persec.bytes_fetched_perSec\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : average bytes per 
second","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"line\",\"params\":{\"shareYAxis\":true,\"addTooltip\":false,\"addLegend\":false,\"showCircles\":true,\"smoothLines\":true,\"interpolate\":\"linear\",\"scale\":\"linear\",\"drawLinesBetweenPoints\":true,\"radiusRatio\":9,\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}}],\"listeners\":{},\"title\":\"Fetcher : average bytes per second\"}"},"id":"Fetcher-:-average-bytes-per-second","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.173Z","version":"WzYsMV0="} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_average_persec.fetched_perSec\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : average pages per 
second","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"line\",\"params\":{\"shareYAxis\":true,\"addTooltip\":false,\"addLegend\":false,\"showCircles\":true,\"smoothLines\":true,\"interpolate\":\"linear\",\"scale\":\"linear\",\"drawLinesBetweenPoints\":true,\"radiusRatio\":9,\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"3\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}}],\"listeners\":{},\"title\":\"Fetcher : average pages per second\"}"},"id":"Fetcher-:-average-pages-per-second","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.820Z","version":"WzEwLDFd"} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_counter.bytes_fetched\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Total bytes 
fetched","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"line\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"showCircles\":true,\"smoothLines\":true,\"interpolate\":\"linear\",\"scale\":\"linear\",\"drawLinesBetweenPoints\":true,\"radiusRatio\":9,\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"sum\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"m\",\"min_doc_count\":1,\"extended_bounds\":{}}}],\"listeners\":{},\"title\":\"Total bytes fetched\"}"},"id":"Total-bytes-fetched","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.823Z","version":"WzExLDFd"} +{"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"filter\":[],\"query\":{\"query\":{\"query_string\":{\"analyze_wildcard\":true,\"query\":\"*\"}},\"language\":\"lucene\"}}"},"optionsJSON":"{\"useMargins\":true,\"hidePanelTitles\":false}","panelsJSON":"[{\"panelIndex\":\"1\",\"gridData\":{\"x\":24,\"y\":20,\"w\":12,\"h\":12,\"i\":\"1\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_0\"},{\"panelIndex\":\"2\",\"gridData\":{\"x\":12,\"y\":20,\"w\":12,\"h\":12,\"i\":\"2\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_1\"},{\"panelIndex\":\"3\",\"gridData\":{\"x\":0,\"y\":0,\"w\":36,\"h\":12,\"i\":\"3\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_2\"},{\"panelIndex\":\"4\",\"gridData\":{\"x\":0,\"y\":20,\"w\":12,\"h\":12,\"i\":\"4\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_3\"},{\"panelIndex\":\"5\",\"gridData\":{\"x\":0,\"y\":40,\"w\":36,\"h\":8,\"i\":\"5\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_4\"},{\"panelIndex\":\"6\",\"gridData\
":{\"x\":0,\"y\":32,\"w\":36,\"h\":8,\"i\":\"6\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_5\"},{\"panelIndex\":\"7\",\"gridData\":{\"x\":0,\"y\":12,\"w\":36,\"h\":8,\"i\":\"7\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_6\"}]","timeRestore":false,"title":"Crawl metrics","version":1},"id":"Crawl-metrics","migrationVersion":{"dashboard":"7.3.0"},"references":[{"id":"Fetcher-:-#-active-threads","name":"panel_0","type":"visualization"},{"id":"Fetcher-:-num-queues","name":"panel_1","type":"visualization"},{"id":"Fetcher-:-pages-fetched","name":"panel_2","type":"visualization"},{"id":"Fetcher-:-URLs-waiting-in-queues","name":"panel_3","type":"visualization"},{"id":"Fetcher-:-average-bytes-per-second","name":"panel_4","type":"visualization"},{"id":"Fetcher-:-average-pages-per-second","name":"panel_5","type":"visualization"},{"id":"Total-bytes-fetched","name":"panel_6","type":"visualization"}],"type":"dashboard","updated_at":"2020-01-06T11:06:58.830Z","version":"WzQsMV0="} +{"exportedCount":9,"missingRefCount":0,"missingReferences":[]} diff --git a/external/opensearch-java/dashboards/status.ndjson b/external/opensearch-java/dashboards/status.ndjson new file mode 100644 index 000000000..b3d0122e4 --- /dev/null +++ b/external/opensearch-java/dashboards/status.ndjson @@ -0,0 +1,5 @@ 
+{"attributes":{"fields":"[{\"name\":\"_id\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_index\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_score\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_source\",\"type\":\"_source\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_type\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"key\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"metadata._redirTo\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.depth\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.error%2Ecause\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.error%2Esource\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.fetch%2Eerror%2Ecount\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.isFeed\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.isSitemap\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"na
me\":\"metadata.url%2Epath\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"nextFetchDate\",\"type\":\"date\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"status\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"url\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":true,\"readFromDocValues\":true}]","title":"status"},"id":"7445c390-7339-11e9-9289-ffa3ee6775e4","migrationVersion":{"index-pattern":"6.5.0"},"references":[],"type":"index-pattern","updated_at":"2020-01-06T11:07:47.130Z","version":"WzEzLDFd"} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"*\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"status count","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"table\",\"params\":{\"perPage\":10,\"showPartialRows\":false,\"showMeticsAtAllLevels\":false},\"aggs\":[{\"id\":\"1\",\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"status\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{},\"title\":\"status count\"}"},"id":"status-count","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"7445c390-7339-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:07:48.278Z","version":"WzE1LDFd"} 
+{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"*\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Top Hosts","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"table\",\"params\":{\"perPage\":10,\"showPartialRows\":false,\"showMeticsAtAllLevels\":false},\"aggs\":[{\"id\":\"1\",\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"key\",\"size\":50,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{},\"title\":\"Top Hosts\"}"},"id":"Top-Hosts","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"7445c390-7339-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:07:48.281Z","version":"WzE2LDFd"} +{"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"filter\":[],\"query\":{\"query\":\"*\",\"language\":\"lucene\"}}"},"optionsJSON":"{\"useMargins\":true,\"hidePanelTitles\":false}","panelsJSON":"[{\"panelRefName\":\"panel_0\",\"version\":\"7.3.0\",\"panelIndex\":\"37874bbf-6607-435a-a231-94d81e9193e7\",\"gridData\":{\"x\":0,\"y\":0,\"w\":16,\"h\":20,\"i\":\"37874bbf-6607-435a-a231-94d81e9193e7\"},\"embeddableConfig\":{}},{\"panelRefName\":\"panel_1\",\"version\":\"7.3.0\",\"panelIndex\":\"4faa5b74-1660-44f7-9227-89d900c8231e\",\"gridData\":{\"x\":16,\"y\":0,\"w\":16,\"h\":20,\"i\":\"4faa5b74-1660-44f7-9227-89d900c8231e\"},\"embeddableConfig\":{}}]","timeRestore":false,"title":"Crawl 
status","version":1},"id":"Crawl-status","migrationVersion":{"dashboard":"7.3.0"},"references":[{"id":"status-count","name":"panel_0","type":"visualization"},{"id":"Top-Hosts","name":"panel_1","type":"visualization"}],"type":"dashboard","updated_at":"2020-01-06T11:07:47.948Z","version":"WzE0LDFd"} +{"exportedCount":4,"missingRefCount":0,"missingReferences":[]} diff --git a/external/opensearch-java/dashboards/storm.ndjson b/external/opensearch-java/dashboards/storm.ndjson new file mode 100644 index 000000000..1d25d1f6e --- /dev/null +++ b/external/opensearch-java/dashboards/storm.ndjson @@ -0,0 +1,5 @@ +{"attributes":{"fields":"[{\"name\":\"_id\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_index\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_score\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_source\",\"type\":\"_source\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_type\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"name\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcComponentId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcTaskId\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerHost\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerPort\",\"type\":\"number\",\"count\":0,\"s
cripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"stormId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"timestamp\",\"type\":\"date\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"value\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true}]","timeFieldName":"timestamp","title":"metrics"},"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","migrationVersion":{"index-pattern":"6.5.0"},"references":[],"type":"index-pattern","updated_at":"2020-01-06T11:06:58.178Z","version":"WzMsMV0="} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name: \\\"__receive.population\\\"\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Storm Receive Queues","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"table\",\"params\":{\"perPage\":10,\"showPartialRows\":false,\"showMeticsAtAllLevels\":false},\"aggs\":[{\"id\":\"1\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"srcTaskId\",\"size\":10,\"order\":\"desc\",\"orderBy\":\"1\"}},{\"id\":\"3\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"srcComponentId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{}}"},"id":"Storm-Receive-Queues","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:09:12.875Z","version":"WzIwLDFd"} 
+{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"+srcComponentId: \\\"__system\\\" +name: memory\\\\/heap*\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Memory Heap","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":true,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":true,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"name\",\"size\":10,\"order\":\"desc\",\"orderBy\":\"1\"}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"split\",\"params\":{\"field\":\"srcWorkerHost\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\",\"row\":true}}],\"listeners\":{}}"},"id":"Memory-Heap","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:09:12.877Z","version":"WzIxLDFd"} 
+{"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"filter\":[],\"query\":{\"query\":\"*\",\"language\":\"lucene\"}}"},"optionsJSON":"{\"useMargins\":true,\"hidePanelTitles\":false}","panelsJSON":"[{\"panelRefName\":\"panel_0\",\"version\":\"7.3.0\",\"panelIndex\":\"19123ee9-8f49-4621-a4dc-716b5ff9fcaf\",\"gridData\":{\"x\":0,\"y\":0,\"w\":32,\"h\":8,\"i\":\"19123ee9-8f49-4621-a4dc-716b5ff9fcaf\"},\"embeddableConfig\":{}},{\"panelRefName\":\"panel_1\",\"version\":\"7.3.0\",\"panelIndex\":\"5fd83542-b7e6-48e0-8679-2ffcacf453a3\",\"gridData\":{\"x\":0,\"y\":8,\"w\":32,\"h\":16,\"i\":\"5fd83542-b7e6-48e0-8679-2ffcacf453a3\"},\"embeddableConfig\":{}}]","timeRestore":false,"title":"Storm metrics","version":1},"id":"Storm-metrics","migrationVersion":{"dashboard":"7.3.0"},"references":[{"id":"Storm-Receive-Queues","name":"panel_0","type":"visualization"},{"id":"Memory-Heap","name":"panel_1","type":"visualization"}],"type":"dashboard","updated_at":"2020-01-06T11:08:33.810Z","version":"WzE5LDFd"} +{"exportedCount":4,"missingRefCount":0,"missingReferences":[]} diff --git a/external/opensearch-java/opensearch-conf.yaml b/external/opensearch-java/opensearch-conf.yaml new file mode 100644 index 000000000..d1d817deb --- /dev/null +++ b/external/opensearch-java/opensearch-conf.yaml @@ -0,0 +1,128 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# configuration for OpenSearch resources + +config: + + # address to use unless a more specific one has been + # defined for a component + # also accepts a list or multiple values in a single line + # separated by a semi-colon e.g. "opensearch1:9200; opensearch2:9200" + opensearch.addresses: "http://localhost:9200" + #opensearch.user: "USERNAME" + #opensearch.password: "PASSWORD" + opensearch.concurrentRequests: 2 + + # Sets the response buffer to the specified value in MB. + # opensearch.responseBufferSize: 100 + + # Disable TLS validation for connection to OpenSearch + # opensearch.disable.tls.validation: false + + # Indexer bolt + # addresses can be specified as a full URL + # if not we assume that the protocol is http and the port 9200 + opensearch.indexer.addresses: "localhost" + opensearch.indexer.index.name: "content" + # opensearch.indexer.pipeline: "_PIPELINE_" + opensearch.indexer.create: false + opensearch.indexer.bulkActions: 100 + opensearch.indexer.flushInterval: "2s" + opensearch.indexer.concurrentRequests: 1 + opensearch.indexer.sniff: true + # Sets the response buffer to the specified value in MB. + # opensearch.indexer.responseBufferSize: 100 + + # MetricsConsumer + # opensearch.metrics.addresses: "http://localhost:9200" + opensearch.metrics.index.name: "metrics" + opensearch.metrics.sniff: true + # Sets the response buffer to the specified value in MB. 
+ # opensearch.metrics.responseBufferSize: 100 + + # Spout and persistence bolt + opensearch.status.addresses: "http://localhost:9200" + opensearch.status.index.name: "status" + #opensearch.status.user: "USERNAME" + #opensearch.status.password: "PASSWORD" + # the routing is done on the value of 'partition.url.mode' + opensearch.status.routing: true + # stores the value used for grouping the URLs as a separate field + # needed by the spout implementations + # also used for routing if the value above is set to true + opensearch.status.routing.fieldname: "key" + opensearch.status.bulkActions: 500 + opensearch.status.flushInterval: "5s" + opensearch.status.concurrentRequests: 1 + opensearch.status.sniff: true + # Sets the response buffer to the specified value in MB. + # opensearch.status.responseBufferSize: 100 + + # spout config # + + # positive or negative filters parsable by the Lucene Query Parser + # opensearch.status.filterQuery: + # - "-(key:stormcrawler.net)" + # - "-(key:apache.stormcrawler.org)" + + # time in secs for which the URLs will be considered for fetching after a ack of fail + spout.ttl.purgatory: 30 + + # Min time (in msecs) to allow between 2 successive queries to OpenSearch + spout.min.delay.queries: 2000 + + # Max time (in msecs) to allow between 2 successive queries to OpenSearch + spout.max.delay.queries: 20000 + + # Delay since previous query date (in secs) after which the nextFetchDate value will be reset to the current time + # Setting this to -1 or a large value means that OpenSearch will cache the results but also that fewer and fewer + # results might be returned. 
+ spout.reset.fetchdate.after: 120 + + opensearch.status.max.buckets: 50 + opensearch.status.max.urls.per.bucket: 2 + # field to group the URLs into buckets + opensearch.status.bucket.field: "key" + # fields to sort the URLs within a bucket + opensearch.status.bucket.sort.field: + - "nextFetchDate" + - "url" + # field to sort the buckets + opensearch.status.global.sort.field: "nextFetchDate" + + # AggregationSpout : sampling improves the performance on large crawls + opensearch.status.sample: false + + # max allowed duration of a query in sec + opensearch.status.query.timeout: -1 + + # AggregationSpout (expert): adds this value in mins to the latest date returned in the results and + # use it as nextFetchDate + opensearch.status.recentDate.increase: -1 + opensearch.status.recentDate.min.gap: -1 + + # Caffeine cache specification for the waitAck cache used in StatusUpdaterBolt. + # If not set, the value of topology.message.timeout.secs is used for expireAfterWrite (default: 300s) + # opensearch.status.waitack.cache.spec: "maximumSize=10000,expireAfterWrite=300s" + + topology.metrics.consumer.register: + - class: "org.apache.stormcrawler.opensearch.metrics.MetricsConsumer" + parallelism.hint: 1 + #whitelist: + # - "fetcher_counter" + # - "fetcher_average.bytes_fetched" + #blacklist: + # - "__receive.*" diff --git a/external/opensearch-java/pom.xml b/external/opensearch-java/pom.xml new file mode 100644 index 000000000..376a11486 --- /dev/null +++ b/external/opensearch-java/pom.xml @@ -0,0 +1,121 @@ + + + + + + 4.0.0 + + + org.apache.stormcrawler + stormcrawler-external + 3.5.2-SNAPSHOT + ../pom.xml + + + + 2.19.5 + true + 0.27 + 0.27 + 0.25 + 0.17 + 0.29 + 0.13 + + + stormcrawler-opensearch-java + jar + + stormcrawler-opensearch-java + + https://github.com/apache/stormcrawler/tree/master/external/opensearch + OpenSearch module for Apache StormCrawler using the new opensearch-java client + + + + + maven-surefire-plugin + + + default-test + test + + test + + + + + + 
${opensearch.version} + + + + + + + + + org.opensearch.client + opensearch-rest-high-level-client + ${opensearch.version} + + + + org.awaitility + awaitility + test + + + + + org.opensearch.client + opensearch-rest-client-sniffer + ${opensearch.version} + + + + org.apache.stormcrawler + stormcrawler-core + ${project.version} + test-jar + test + + + + org.testcontainers + testcontainers + test + + + + org.testcontainers + junit-jupiter + test + + + + org.slf4j + slf4j-simple + test + + + + diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/BulkItemResponseToFailedFlag.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/BulkItemResponseToFailedFlag.java new file mode 100644 index 000000000..e4eec09ef --- /dev/null +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/BulkItemResponseToFailedFlag.java @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.stormcrawler.opensearch; + +import java.io.IOException; +import org.jetbrains.annotations.NotNull; +import org.opensearch.action.DocWriteRequest; +import org.opensearch.action.DocWriteResponse; +import org.opensearch.action.bulk.BulkItemResponse; +import org.opensearch.core.common.io.stream.StreamOutput; +import org.opensearch.core.rest.RestStatus; +import org.opensearch.core.xcontent.ToXContent; +import org.opensearch.core.xcontent.XContentBuilder; + +public final class BulkItemResponseToFailedFlag { + @NotNull public final BulkItemResponse response; + public final boolean failed; + @NotNull public final String id; + + public BulkItemResponseToFailedFlag(@NotNull BulkItemResponse response, boolean failed) { + this.response = response; + this.failed = failed; + this.id = response.getId(); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof BulkItemResponseToFailedFlag)) { + return false; + } + + BulkItemResponseToFailedFlag that = (BulkItemResponseToFailedFlag) o; + + if (failed != that.failed) { + return false; + } + if (!response.equals(that.response)) { + return false; + } + return id.equals(that.id); + } + + @Override + public int hashCode() { + int result = response.hashCode(); + result = 31 * result + (failed ? 
1 : 0); + result = 31 * result + id.hashCode(); + return result; + } + + @Override + public String toString() { + return "BulkItemResponseToFailedFlag{" + + "response=" + + response + + ", failed=" + + failed + + ", id='" + + id + + '\'' + + '}'; + } + + public RestStatus status() { + return response.status(); + } + + public XContentBuilder toXContent(XContentBuilder builder, ToXContent.Params params) + throws IOException { + return response.toXContent(builder, params); + } + + public int getItemId() { + return response.getItemId(); + } + + public DocWriteRequest.OpType getOpType() { + return response.getOpType(); + } + + public String getIndex() { + return response.getIndex(); + } + + public long getVersion() { + return response.getVersion(); + } + + public T getResponse() { + return response.getResponse(); + } + + public boolean isFailed() { + return response.isFailed(); + } + + public String getFailureMessage() { + return response.getFailureMessage(); + } + + public BulkItemResponse.Failure getFailure() { + return response.getFailure(); + } + + public void writeTo(StreamOutput out) throws IOException { + response.writeTo(out); + } + + public void writeThin(StreamOutput out) throws IOException { + response.writeThin(out); + } + + public boolean isFragment() { + return response.isFragment(); + } +} diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/Constants.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/Constants.java new file mode 100644 index 000000000..8c0cbc989 --- /dev/null +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/Constants.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.stormcrawler.opensearch; + +public interface Constants { + + String PARAMPREFIX = "opensearch."; +} diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/IndexCreation.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/IndexCreation.java new file mode 100644 index 000000000..180a10743 --- /dev/null +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/IndexCreation.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.stormcrawler.opensearch; + +import com.google.common.base.Charsets; +import com.google.common.io.Resources; +import java.io.IOException; +import java.net.URL; +import org.opensearch.OpenSearchException; +import org.opensearch.action.support.master.AcknowledgedResponse; +import org.opensearch.client.RequestOptions; +import org.opensearch.client.RestHighLevelClient; +import org.opensearch.client.indices.CreateIndexRequest; +import org.opensearch.client.indices.CreateIndexResponse; +import org.opensearch.client.indices.GetIndexRequest; +import org.opensearch.client.indices.IndexTemplatesExistRequest; +import org.opensearch.client.indices.PutIndexTemplateRequest; +import org.opensearch.common.xcontent.XContentType; +import org.slf4j.Logger; + +public class IndexCreation { + + public static synchronized void checkOrCreateIndex( + RestHighLevelClient client, String indexName, String boltType, Logger log) + throws IOException { + final boolean indexExists = + client.indices().exists(new GetIndexRequest(indexName), RequestOptions.DEFAULT); + log.info("Index '{}' exists? {}", indexName, indexExists); + // there's a possible check-then-update race condition + // createIndex intentionally catches and logs exceptions from OpenSearch + if (!indexExists) { + boolean created = + IndexCreation.createIndex(client, indexName, boltType + ".mapping", log); + log.info("Index '{}' created? {} using {}", indexName, created, boltType + ".mapping"); + } + } + + public static synchronized void checkOrCreateIndexTemplate( + RestHighLevelClient client, String boltType, Logger log) throws IOException { + final String templateName = boltType + "-template"; + final boolean templateExists = + client.indices() + .existsTemplate( + new IndexTemplatesExistRequest(templateName), + RequestOptions.DEFAULT); + log.info("Template '{}' exists? 
{}", templateName, templateExists); + // there's a possible check-then-update race condition + // createTemplate intentionally catches and logs exceptions from OpenSearch + if (!templateExists) { + boolean created = + IndexCreation.createTemplate(client, templateName, boltType + ".mapping", log); + log.info("templateExists '{}' created? {}", templateName, created); + } + } + + private static boolean createTemplate( + RestHighLevelClient client, String templateName, String resourceName, Logger log) { + + try { + final PutIndexTemplateRequest createIndexRequest = + new PutIndexTemplateRequest(templateName); + + final URL mapping = + Thread.currentThread().getContextClassLoader().getResource(resourceName); + + final String jsonIndexConfiguration = Resources.toString(mapping, Charsets.UTF_8); + + createIndexRequest.source(jsonIndexConfiguration, XContentType.JSON); + + final AcknowledgedResponse createIndexResponse = + client.indices().putTemplate(createIndexRequest, RequestOptions.DEFAULT); + return createIndexResponse.isAcknowledged(); + } catch (IOException | OpenSearchException e) { + log.warn("template '{}' not created", templateName, e); + return false; + } + } + + private static boolean createIndex( + RestHighLevelClient client, String indexName, String resourceName, Logger log) { + + try { + + final CreateIndexRequest createIndexRequest = new CreateIndexRequest(indexName); + + final URL mapping = + Thread.currentThread().getContextClassLoader().getResource(resourceName); + + final String jsonIndexConfiguration = Resources.toString(mapping, Charsets.UTF_8); + + createIndexRequest.source(jsonIndexConfiguration, XContentType.JSON); + + final CreateIndexResponse createIndexResponse = + client.indices().create(createIndexRequest, RequestOptions.DEFAULT); + return createIndexResponse.isAcknowledged(); + } catch (IOException | OpenSearchException e) { + log.warn("index '{}' not created", indexName, e); + return false; + } + } +} diff --git 
a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/OpenSearchConnection.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/OpenSearchConnection.java new file mode 100644 index 000000000..c3662a098 --- /dev/null +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/OpenSearchConnection.java @@ -0,0 +1,349 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.stormcrawler.opensearch; + +import static org.opensearch.client.RestClientBuilder.DEFAULT_CONNECT_TIMEOUT_MILLIS; +import static org.opensearch.client.RestClientBuilder.DEFAULT_SOCKET_TIMEOUT_MILLIS; + +import java.io.IOException; +import java.net.URI; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.StringUtils; +import org.apache.http.HttpHost; +import org.apache.http.auth.AuthScope; +import org.apache.http.auth.UsernamePasswordCredentials; +import org.apache.http.client.CredentialsProvider; +import org.apache.http.conn.ssl.NoopHostnameVerifier; +import org.apache.http.conn.ssl.TrustAllStrategy; +import org.apache.http.impl.client.BasicCredentialsProvider; +import org.apache.http.ssl.SSLContextBuilder; +import org.apache.stormcrawler.util.ConfUtils; +import org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; +import org.opensearch.action.DocWriteRequest; +import org.opensearch.action.bulk.BulkProcessor; +import org.opensearch.action.bulk.BulkRequest; +import org.opensearch.action.bulk.BulkResponse; +import org.opensearch.client.HttpAsyncResponseConsumerFactory; +import org.opensearch.client.Node; +import org.opensearch.client.RequestOptions; +import org.opensearch.client.RestClient; +import org.opensearch.client.RestClientBuilder; +import org.opensearch.client.RestHighLevelClient; +import org.opensearch.client.sniff.Sniffer; +import org.opensearch.common.unit.TimeValue; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Utility class to instantiate an OpenSearch client and bulkprocessor based on the configuration. 
+ */ +public final class OpenSearchConnection { + + private static final Logger LOG = LoggerFactory.getLogger(OpenSearchConnection.class); + + @NotNull private final RestHighLevelClient client; + + @NotNull private final BulkProcessor processor; + + @Nullable private final Sniffer sniffer; + + private OpenSearchConnection( + @NotNull RestHighLevelClient c, @NotNull BulkProcessor p, @Nullable Sniffer s) { + processor = p; + client = c; + sniffer = s; + } + + public RestHighLevelClient getClient() { + return client; + } + + public static RestHighLevelClient getClient(Map stormConf, String boltType) { + + final String dottedType = boltType + "."; + + final List hosts = new ArrayList<>(); + + final List confighosts = + ConfUtils.loadListFromConf( + Constants.PARAMPREFIX, dottedType, "addresses", stormConf); + + // find ; separated values and tokenise as multiple addresses + // e.g. opensearch1:9200; opensearch2:9200 + if (confighosts.size() == 1) { + String input = confighosts.get(0); + confighosts.clear(); + confighosts.addAll(Arrays.asList(input.split(" *; *"))); + } + + for (String host : confighosts) { + // no port specified? use default one + int port = 9200; + String scheme = "http"; + // no scheme specified? 
use http + if (!host.startsWith(scheme)) { + host = "http://" + host; + } + URI uri = URI.create(host); + if (uri.getHost() == null) { + throw new RuntimeException("host undefined " + host); + } + if (uri.getPort() != -1) { + port = uri.getPort(); + } + if (uri.getScheme() != null) { + scheme = uri.getScheme(); + } + hosts.add(new HttpHost(uri.getHost(), port, scheme)); + } + + final RestClientBuilder builder = RestClient.builder(hosts.toArray(new HttpHost[0])); + + // authentication via user / password + final String user = + ConfUtils.getString(stormConf, Constants.PARAMPREFIX, dottedType, "user"); + final String password = + ConfUtils.getString(stormConf, Constants.PARAMPREFIX, dottedType, "password"); + + final String proxyhost = + ConfUtils.getString(stormConf, Constants.PARAMPREFIX, dottedType, "proxy.host"); + + final int proxyport = + ConfUtils.getInt(stormConf, Constants.PARAMPREFIX, dottedType, "proxy.port", -1); + + final String proxyscheme = + ConfUtils.getString( + stormConf, Constants.PARAMPREFIX, dottedType, "proxy.scheme", "http"); + + final boolean disableTlsValidation = + ConfUtils.getBoolean( + stormConf, Constants.PARAMPREFIX, "", "disable.tls.validation", false); + + final boolean needsUser = StringUtils.isNotBlank(user) && StringUtils.isNotBlank(password); + final boolean needsProxy = StringUtils.isNotBlank(proxyhost) && proxyport != -1; + + if (needsUser || needsProxy || disableTlsValidation) { + builder.setHttpClientConfigCallback( + httpClientBuilder -> { + if (needsUser) { + final CredentialsProvider credentialsProvider = + new BasicCredentialsProvider(); + credentialsProvider.setCredentials( + AuthScope.ANY, new UsernamePasswordCredentials(user, password)); + httpClientBuilder.setDefaultCredentialsProvider(credentialsProvider); + } + if (needsProxy) { + httpClientBuilder.setProxy( + new HttpHost(proxyhost, proxyport, proxyscheme)); + } + + if (disableTlsValidation) { + try { + final SSLContextBuilder sslContext = new SSLContextBuilder(); 
+ sslContext.loadTrustMaterial(null, new TrustAllStrategy()); + httpClientBuilder.setSSLContext(sslContext.build()); + httpClientBuilder.setSSLHostnameVerifier( + NoopHostnameVerifier.INSTANCE); + } catch (Exception e) { + throw new RuntimeException("Failed to disable TLS validation", e); + } + } + return httpClientBuilder; + }); + } + + final int connectTimeout = + ConfUtils.getInt( + stormConf, + Constants.PARAMPREFIX, + dottedType, + "connect.timeout", + DEFAULT_CONNECT_TIMEOUT_MILLIS); + final int socketTimeout = + ConfUtils.getInt( + stormConf, + Constants.PARAMPREFIX, + dottedType, + "socket.timeout", + DEFAULT_SOCKET_TIMEOUT_MILLIS); + // timeout until connection is established + builder.setRequestConfigCallback( + requestConfigBuilder -> + requestConfigBuilder + .setConnectTimeout(connectTimeout) + // Timeout when waiting for data + .setSocketTimeout(socketTimeout)); + + // TODO check if this has gone somewhere else + // int maxRetryTimeout = ConfUtils.getInt(stormConf, Constants.PARAMPREFIX + + // boltType + + // ".max.retry.timeout", + // DEFAULT_MAX_RETRY_TIMEOUT_MILLIS); + // builder.setMaxRetryTimeoutMillis(maxRetryTimeout); + + // TODO configure headers etc... 
+ // Map configSettings = (Map) stormConf + // .get(Constants.PARAMPREFIX + boltType + ".settings"); + // if (configSettings != null) { + // configSettings.forEach((k, v) -> settings.put(k, v)); + // } + + // use node selector only to log nodes listed in the config + // and/or discovered through sniffing + builder.setNodeSelector( + nodes -> { + for (Node node : nodes) { + LOG.debug( + "Connected to OpenSearch node {} [{}] for {}", + node.getName(), + node.getHost(), + boltType); + } + }); + + final boolean compression = + ConfUtils.getBoolean( + stormConf, Constants.PARAMPREFIX, dottedType, "compression", false); + + builder.setCompressionEnabled(compression); + + return new RestHighLevelClient(builder); + } + + public void addToProcessor(final DocWriteRequest request) { + processor.add(request); + } + + /** + * Creates a connection with a default listener. The values for bolt type are + * [indexer,status,metrics] + */ + public static OpenSearchConnection getConnection( + Map stormConf, String boltType) { + BulkProcessor.Listener listener = + new BulkProcessor.Listener() { + @Override + public void afterBulk(long arg0, BulkRequest arg1, BulkResponse arg2) {} + + @Override + public void afterBulk(long arg0, BulkRequest arg1, Throwable arg2) {} + + @Override + public void beforeBulk(long arg0, BulkRequest arg1) {} + }; + return getConnection(stormConf, boltType, listener); + } + + public static OpenSearchConnection getConnection( + Map stormConf, String boltType, BulkProcessor.Listener listener) { + + final RestHighLevelClient client = getClient(stormConf, boltType); + + final String dottedType = boltType + "."; + + final String flushIntervalString = + ConfUtils.getString( + stormConf, Constants.PARAMPREFIX, dottedType, "flushInterval", "5s"); + + final TimeValue flushInterval = + TimeValue.parseTimeValue( + flushIntervalString, TimeValue.timeValueSeconds(5), "flushInterval"); + + final int bulkActions = + ConfUtils.getInt(stormConf, Constants.PARAMPREFIX, 
dottedType, "bulkActions", 50); + + final int concurrentRequests = + ConfUtils.getInt( + stormConf, Constants.PARAMPREFIX, dottedType, "concurrentRequests", 1); + + final RequestOptions requestOptions = RequestOptions.DEFAULT; + final RequestOptions.Builder requestOptionsBuilder = requestOptions.toBuilder(); + final int bufferSize = + ConfUtils.getInt( + stormConf, Constants.PARAMPREFIX, dottedType, "responseBufferSize", 100); + + requestOptionsBuilder.setHttpAsyncResponseConsumerFactory( + new HttpAsyncResponseConsumerFactory.HeapBufferedResponseConsumerFactory( + bufferSize * 1024 * 1024)); + + final BulkProcessor bulkProcessor = + BulkProcessor.builder( + (request, bulkListener) -> + client.bulkAsync( + request, + requestOptionsBuilder.build(), + bulkListener), + listener) + .setFlushInterval(flushInterval) + .setBulkActions(bulkActions) + .setConcurrentRequests(concurrentRequests) + .build(); + + boolean sniff = + ConfUtils.getBoolean(stormConf, Constants.PARAMPREFIX, dottedType, "sniff", true); + Sniffer sniffer = null; + if (sniff) { + sniffer = Sniffer.builder(client.getLowLevelClient()).build(); + } + + return new OpenSearchConnection(client, bulkProcessor, sniffer); + } + + private boolean isClosed = false; + + public void close() { + + if (isClosed) { + LOG.warn("Tried to close an already closed connection!"); + return; + } + + // Maybe some kind of identifier? 
+ LOG.debug("Start closing the OpenSearch connection"); + + // First, close the BulkProcessor ensuring pending actions are flushed + try { + boolean success = processor.awaitClose(60, TimeUnit.SECONDS); + if (!success) { + throw new RuntimeException( + "Failed to flush pending actions when closing BulkProcessor"); + } + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + + if (sniffer != null) { + sniffer.close(); + } + + // Now close the actual client + try { + client.close(); + } catch (IOException e) { + // ignore silently + LOG.trace("Client threw IO exception."); + } + + isClosed = true; + } +} diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/bolt/DeletionBolt.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/bolt/DeletionBolt.java new file mode 100644 index 000000000..c67b90951 --- /dev/null +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/bolt/DeletionBolt.java @@ -0,0 +1,318 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.stormcrawler.opensearch.bolt; + +import com.github.benmanes.caffeine.cache.Cache; +import com.github.benmanes.caffeine.cache.Caffeine; +import com.github.benmanes.caffeine.cache.RemovalCause; +import com.github.benmanes.caffeine.cache.RemovalListener; +import java.lang.invoke.MethodHandles; +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.ReentrantLock; +import java.util.stream.Collectors; +import org.apache.storm.task.OutputCollector; +import org.apache.storm.task.TopologyContext; +import org.apache.storm.topology.OutputFieldsDeclarer; +import org.apache.storm.topology.base.BaseRichBolt; +import org.apache.storm.tuple.Tuple; +import org.apache.stormcrawler.Metadata; +import org.apache.stormcrawler.opensearch.BulkItemResponseToFailedFlag; +import org.apache.stormcrawler.opensearch.OpenSearchConnection; +import org.apache.stormcrawler.util.ConfUtils; +import org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; +import org.opensearch.action.DocWriteRequest; +import org.opensearch.action.bulk.BulkItemResponse; +import org.opensearch.action.bulk.BulkProcessor.Listener; +import org.opensearch.action.bulk.BulkRequest; +import org.opensearch.action.bulk.BulkResponse; +import org.opensearch.action.delete.DeleteRequest; +import org.opensearch.core.rest.RestStatus; +import org.slf4j.LoggerFactory; + +/** + * Deletes documents in OpenSearch. This should be connected to the StatusUpdaterBolt via the + * 'deletion' stream and will remove the documents with a status of ERROR. Note that this component + * will also try to delete documents even though they were never indexed and it currently won't + * delete documents which were indexed under the canonical URL. 
+ */ +public class DeletionBolt extends BaseRichBolt + implements RemovalListener>, Listener { + + static final org.slf4j.Logger LOG = + LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + private static final String BOLT_TYPE = "indexer"; + + private OutputCollector _collector; + + private String indexName; + + private OpenSearchConnection connection; + + private Cache> waitAck; + + // Be fair due to cache timeout + private final ReentrantLock waitAckLock = new ReentrantLock(true); + + public DeletionBolt() {} + + /** Sets the index name instead of taking it from the configuration. * */ + public DeletionBolt(String indexName) { + this.indexName = indexName; + } + + @Override + public void prepare( + Map conf, TopologyContext context, OutputCollector collector) { + _collector = collector; + if (indexName == null) { + indexName = ConfUtils.getString(conf, IndexerBolt.OSIndexNameParamName, "content"); + } + + try { + connection = OpenSearchConnection.getConnection(conf, BOLT_TYPE, this); + } catch (Exception e1) { + LOG.error("Can't connect to opensearch", e1); + throw new RuntimeException(e1); + } + + waitAck = + Caffeine.newBuilder() + .expireAfterWrite(60, TimeUnit.SECONDS) + .removalListener(this) + .build(); + + context.registerMetric("waitAck", () -> waitAck.estimatedSize(), 10); + } + + public void onRemoval( + @Nullable String key, @Nullable List value, @NotNull RemovalCause cause) { + if (!cause.wasEvicted()) { + return; + } + if (value != null) { + LOG.error("Purged from waitAck {} with {} values", key, value.size()); + for (Tuple t : value) { + _collector.fail(t); + } + } else { + // This should never happen, but log it anyway. 
+ LOG.error("Purged from waitAck {} with no values", key); + } + } + + @Override + public void cleanup() { + if (connection != null) { + connection.close(); + } + } + + @Override + public void execute(Tuple tuple) { + String url = tuple.getStringByField("url"); + Metadata metadata = (Metadata) tuple.getValueByField("metadata"); + + // keep it simple for now and ignore cases where the canonical URL was + // used + + final String docID = getDocumentID(metadata, url); + DeleteRequest dr = new DeleteRequest(getIndexName(metadata), docID); + connection.addToProcessor(dr); + + waitAckLock.lock(); + try { + List tt = waitAck.getIfPresent(docID); + if (tt == null) { + tt = new LinkedList<>(); + waitAck.put(docID, tt); + } + tt.add(tuple); + LOG.debug("Added to waitAck {} with ID {} total {}", url, docID, tt.size()); + } finally { + waitAckLock.unlock(); + } + } + + @Override + public void declareOutputFields(OutputFieldsDeclarer arg0) { + // none + } + + /** + * Must be overridden for implementing custom index names based on some metadata information By + * Default, indexName coming from config is used + */ + protected String getIndexName(Metadata m) { + return indexName; + } + + /** + * Get the document id. + * + * @param metadata The {@link Metadata}. + * @param url The normalised url. + * @return Return the normalised url SHA-256 digest as String. 
+ */ + protected String getDocumentID(Metadata metadata, String url) { + return org.apache.commons.codec.digest.DigestUtils.sha256Hex(url); + } + + @Override + public void beforeBulk(long executionId, BulkRequest request) {} + + @Override + public void afterBulk(long executionId, BulkRequest request, BulkResponse response) { + var idsToBulkItemsWithFailedFlag = + Arrays.stream(response.getItems()) + .map( + bir -> { + String id = bir.getId(); + BulkItemResponse.Failure f = bir.getFailure(); + boolean failed = false; + if (f != null) { + if (f.getStatus().equals(RestStatus.CONFLICT)) { + LOG.debug("Doc conflict ID {}", id); + } else { + failed = true; + } + } + return new BulkItemResponseToFailedFlag(bir, failed); + }) + .collect( + // https://github.com/apache/stormcrawler/issues/832 + Collectors.groupingBy( + idWithFailedFlagTuple -> idWithFailedFlagTuple.id, + Collectors.toUnmodifiableList())); + Map> presentTuples; + long estimatedSize; + waitAckLock.lock(); + try { + presentTuples = waitAck.getAllPresent(idsToBulkItemsWithFailedFlag.keySet()); + if (!presentTuples.isEmpty()) { + waitAck.invalidateAll(presentTuples.keySet()); + } + estimatedSize = waitAck.estimatedSize(); + } finally { + waitAckLock.unlock(); + } + + int ackCount = 0; + int failureCount = 0; + + for (var entry : presentTuples.entrySet()) { + final var id = entry.getKey(); + final var associatedTuple = entry.getValue(); + final var bulkItemsWithFailedFlag = idsToBulkItemsWithFailedFlag.get(id); + + BulkItemResponseToFailedFlag selected; + + if (bulkItemsWithFailedFlag.size() == 1) { + selected = bulkItemsWithFailedFlag.get(0); + } else { + // Fallback if there are multiple responses for the same id + BulkItemResponseToFailedFlag tmp = null; + var ctFailed = 0; + for (var buwff : bulkItemsWithFailedFlag) { + if (tmp == null) { + tmp = buwff; + } + if (buwff.failed) { + ctFailed++; + } else { + tmp = buwff; + } + } + if (ctFailed != bulkItemsWithFailedFlag.size()) { + LOG.warn( + "The id {} would 
result in an ack and a failure. Using only the ack for processing.", + id); + } + selected = Objects.requireNonNull(tmp); + } + + if (associatedTuple != null) { + LOG.debug("Found {} tuple(s) for ID {}", associatedTuple.size(), id); + for (Tuple t : associatedTuple) { + String url = (String) t.getValueByField("url"); + + Metadata metadata = (Metadata) t.getValueByField("metadata"); + + if (!selected.failed) { + ackCount++; + _collector.ack(t); + } else { + failureCount++; + var failure = selected.getFailure(); + LOG.error("update ID {}, URL {}, failure: {}", id, url, failure); + _collector.fail(t); + } + } + } else { + LOG.warn("Could not find unacked tuples for {}", entry.getKey()); + } + } + + LOG.info( + "Bulk response [{}] : items {}, waitAck {}, acked {}, failed {}", + executionId, + idsToBulkItemsWithFailedFlag.size(), + estimatedSize, + ackCount, + failureCount); + } + + @Override + public void afterBulk(long executionId, BulkRequest request, Throwable failure) { + LOG.error("Exception with bulk {} - failing the whole lot ", executionId, failure); + + final var failedIds = + request.requests().stream() + .map(DocWriteRequest::id) + .collect(Collectors.toUnmodifiableSet()); + Map> failedTupleLists; + waitAckLock.lock(); + try { + failedTupleLists = waitAck.getAllPresent(failedIds); + if (!failedTupleLists.isEmpty()) { + waitAck.invalidateAll(failedTupleLists.keySet()); + } + } finally { + waitAckLock.unlock(); + } + + for (var id : failedIds) { + var failedTuples = failedTupleLists.get(id); + if (failedTuples != null) { + LOG.debug("Failed {} tuple(s) for ID {}", failedTuples.size(), id); + for (Tuple x : failedTuples) { + // fail it + _collector.fail(x); + } + } else { + LOG.warn("Could not find unacked tuple for {}", id); + } + } + } +} diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/bolt/IndexerBolt.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/bolt/IndexerBolt.java new file mode 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to you under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.stormcrawler.opensearch.bolt;

import static org.apache.stormcrawler.Constants.StatusStreamName;
import static org.opensearch.common.xcontent.XContentFactory.jsonBuilder;

import com.github.benmanes.caffeine.cache.Cache;
import com.github.benmanes.caffeine.cache.Caffeine;
import com.github.benmanes.caffeine.cache.RemovalCause;
import com.github.benmanes.caffeine.cache.RemovalListener;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.ReentrantLock;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.apache.storm.metric.api.MultiCountMetric;
import org.apache.storm.metric.api.MultiReducedMetric;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import org.apache.stormcrawler.Constants;
import org.apache.stormcrawler.Metadata;
import org.apache.stormcrawler.indexing.AbstractIndexerBolt;
import org.apache.stormcrawler.opensearch.BulkItemResponseToFailedFlag;
import org.apache.stormcrawler.opensearch.IndexCreation;
import org.apache.stormcrawler.opensearch.OpenSearchConnection;
import org.apache.stormcrawler.persistence.Status;
import org.apache.stormcrawler.util.ConfUtils;
import org.apache.stormcrawler.util.PerSecondReducer;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.opensearch.action.DocWriteRequest;
import org.opensearch.action.bulk.BulkItemResponse;
import org.opensearch.action.bulk.BulkProcessor;
import org.opensearch.action.bulk.BulkRequest;
import org.opensearch.action.bulk.BulkResponse;
import org.opensearch.action.index.IndexRequest;
import org.opensearch.core.rest.RestStatus;
import org.opensearch.core.xcontent.XContentBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Sends documents to opensearch. Indexes all the fields from the tuples or a Map
 * &lt;String,Object&gt; from a named field.
 */
public class IndexerBolt extends AbstractIndexerBolt
        implements RemovalListener<String, List<Tuple>>, BulkProcessor.Listener {

    private static final Logger LOG = LoggerFactory.getLogger(IndexerBolt.class);

    private static final String OSBoltType = "indexer";

    static final String OSIndexNameParamName =
            org.apache.stormcrawler.opensearch.Constants.PARAMPREFIX + OSBoltType + ".index.name";
    private static final String OSCreateParamName =
            org.apache.stormcrawler.opensearch.Constants.PARAMPREFIX + OSBoltType + ".create";
    private static final String OSIndexPipelineParamName =
            org.apache.stormcrawler.opensearch.Constants.PARAMPREFIX + OSBoltType + ".pipeline";

    private OutputCollector _collector;

    private String indexName;

    // optional ingest pipeline applied server-side to every IndexRequest
    private String pipeline;

    // whether the document will be created only if it does not exist or
    // overwritten
    private boolean create = false;

    private MultiCountMetric eventCounter;

    private OpenSearchConnection connection;

    private MultiReducedMetric perSecMetrics;

    // tuples pending a bulk ack, keyed by document ID; entries expire after
    // 60s and expired tuples are failed via onRemoval
    private Cache<String, List<Tuple>> waitAck;

    // Be fair due to cache timeout
    private final ReentrantLock waitAckLock = new ReentrantLock(true);

    public IndexerBolt() {}

    /** Sets the index name instead of taking it from the configuration. * */
    public IndexerBolt(String indexName) {
        this.indexName = indexName;
    }

    @Override
    public void prepare(
            Map<String, Object> conf, TopologyContext context, OutputCollector collector) {
        super.prepare(conf, context, collector);
        _collector = collector;
        if (indexName == null) {
            indexName = ConfUtils.getString(conf, IndexerBolt.OSIndexNameParamName, "content");
        }

        create = ConfUtils.getBoolean(conf, IndexerBolt.OSCreateParamName, false);
        pipeline = ConfUtils.getString(conf, IndexerBolt.OSIndexPipelineParamName);

        try {
            connection = OpenSearchConnection.getConnection(conf, OSBoltType, this);
        } catch (Exception e1) {
            LOG.error("Can't connect to opensearch", e1);
            throw new RuntimeException(e1);
        }

        this.eventCounter = context.registerMetric("OpensearchIndexer", new MultiCountMetric(), 10);

        this.perSecMetrics =
                context.registerMetric(
                        "Indexer_average_persec",
                        new MultiReducedMetric(new PerSecondReducer()),
                        10);

        waitAck =
                Caffeine.newBuilder()
                        .expireAfterWrite(60, TimeUnit.SECONDS)
                        .removalListener(this)
                        .build();

        context.registerMetric("waitAck", () -> waitAck.estimatedSize(), 10);

        // use the default status schema if none has been specified
        try {
            IndexCreation.checkOrCreateIndex(connection.getClient(), indexName, OSBoltType, LOG);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /** Fails any tuples evicted from the pending-ack cache before a bulk response arrived. */
    @Override
    public void onRemoval(
            @Nullable String key, @Nullable List<Tuple> value, @NotNull RemovalCause cause) {
        if (!cause.wasEvicted()) {
            return;
        }
        if (value != null) {
            LOG.error("Purged from waitAck {} with {} values", key, value.size());
            for (Tuple t : value) {
                _collector.fail(t);
            }
        } else {
            // This should never happen, but log it anyway.
            LOG.error("Purged from waitAck {} with no values", key);
        }
    }

    @Override
    public void cleanup() {
        if (connection != null) {
            connection.close();
        }
    }

    @Override
    public void execute(Tuple tuple) {

        final String url = tuple.getStringByField("url");

        // Distinguish the value used for indexing
        // from the one used for the status
        final String normalisedurl = valueForURL(tuple);

        LOG.info("Indexing {} as {}", url, normalisedurl);

        final Metadata metadata = (Metadata) tuple.getValueByField("metadata");

        if (!filterDocument(metadata)) {
            LOG.info("Filtered {}", url);
            eventCounter.scope("Filtered").incrBy(1);
            // treat it as successfully processed even if
            // we do not index it
            _collector.emit(StatusStreamName, tuple, new Values(url, metadata, Status.FETCHED));
            _collector.ack(tuple);
            return;
        }

        final String docID = getDocumentID(metadata, normalisedurl);

        try {
            final XContentBuilder builder = jsonBuilder().startObject();

            // display text of the document?
            if (StringUtils.isNotBlank(fieldNameForText())) {
                // text is trimmed once here; no need to trim again below
                final String text = trimText(tuple.getStringByField("text"));
                if (!ignoreEmptyFields() || StringUtils.isNotBlank(text)) {
                    builder.field(fieldNameForText(), text);
                }
            }

            // send URL as field?
            if (StringUtils.isNotBlank(fieldNameForURL())) {
                builder.field(fieldNameForURL(), normalisedurl);
            }

            // which metadata to display?
            final Map<String, String[]> keyVals = filterMetadata(metadata);

            for (Entry<String, String[]> entry : keyVals.entrySet()) {
                if (entry.getValue().length == 1) {
                    final String value = entry.getValue()[0];
                    if (!ignoreEmptyFields() || StringUtils.isNotBlank(value)) {
                        builder.field(entry.getKey(), value);
                    }
                } else if (entry.getValue().length > 1) {
                    builder.array(entry.getKey(), entry.getValue());
                }
            }

            builder.endObject();

            final IndexRequest indexRequest =
                    new IndexRequest(getIndexName(metadata))
                            .source(builder)
                            .id(docID)
                            .create(create);

            if (pipeline != null) {
                indexRequest.setPipeline(pipeline);
            }

            connection.addToProcessor(indexRequest);

            eventCounter.scope("Indexed").incrBy(1);
            perSecMetrics.scope("Indexed").update(1);

            // register the tuple under the doc ID so the bulk listener can ack/fail it
            waitAckLock.lock();
            try {
                List<Tuple> tt = waitAck.getIfPresent(docID);
                if (tt == null) {
                    tt = new LinkedList<>();
                    waitAck.put(docID, tt);
                }
                tt.add(tuple);
                LOG.debug("Added to waitAck {} with ID {} total {}", url, docID, tt.size());
            } finally {
                waitAckLock.unlock();
            }
        } catch (IOException e) {
            LOG.error("Error building document for OpenSearch", e);
            // do not send to status stream so that it gets replayed
            _collector.fail(tuple);

            waitAckLock.lock();
            try {
                waitAck.invalidate(docID);
            } finally {
                waitAckLock.unlock();
            }
        }
    }

    /**
     * Must be overridden for implementing custom index names based on some metadata information By
     * Default, indexName coming from config is used
     */
    protected String getIndexName(Metadata m) {
        return indexName;
    }

    @Override
    public void beforeBulk(long executionId, BulkRequest request) {
        eventCounter.scope("bulks_sent").incrBy(1);
    }

    @Override
    public void afterBulk(long executionId, BulkRequest request, BulkResponse response) {
        eventCounter.scope("bulks_received").incrBy(1);
        eventCounter.scope("bulk_msec").incrBy(response.getTook().getMillis());

        // group the bulk items by doc ID; a CONFLICT is treated as a success
        var idsToBulkItemsWithFailedFlag =
                Arrays.stream(response.getItems())
                        .map(
                                bir -> {
                                    String id = bir.getId();
                                    BulkItemResponse.Failure f = bir.getFailure();
                                    boolean failed = false;
                                    if (f != null) {
                                        if (f.getStatus().equals(RestStatus.CONFLICT)) {
                                            eventCounter.scope("doc_conflicts").incrBy(1);
                                            LOG.debug("Doc conflict ID {}", id);
                                        } else {
                                            failed = true;
                                        }
                                    }
                                    return new BulkItemResponseToFailedFlag(bir, failed);
                                })
                        .collect(
                                // https://github.com/apache/stormcrawler/issues/832
                                Collectors.groupingBy(
                                        idWithFailedFlagTuple -> idWithFailedFlagTuple.id,
                                        Collectors.toUnmodifiableList()));

        Map<String, List<Tuple>> presentTuples;
        long estimatedSize;
        Set<String> debugInfo = null;
        waitAckLock.lock();
        try {
            presentTuples = waitAck.getAllPresent(idsToBulkItemsWithFailedFlag.keySet());
            if (!presentTuples.isEmpty()) {
                waitAck.invalidateAll(presentTuples.keySet());
            }
            estimatedSize = waitAck.estimatedSize();
            // Only if we have to.
            if (LOG.isDebugEnabled() && estimatedSize > 0L) {
                debugInfo = new HashSet<>(waitAck.asMap().keySet());
            }
        } finally {
            waitAckLock.unlock();
        }

        int ackCount = 0;
        int failureCount = 0;

        for (var entry : presentTuples.entrySet()) {
            final var id = entry.getKey();
            final var associatedTuple = entry.getValue();
            final var bulkItemsWithFailedFlag = idsToBulkItemsWithFailedFlag.get(id);

            BulkItemResponseToFailedFlag selected;

            if (bulkItemsWithFailedFlag.size() == 1) {
                selected = bulkItemsWithFailedFlag.get(0);
            } else {
                // Fallback if there are multiple responses for the same id:
                // prefer a non-failed response if any exists
                BulkItemResponseToFailedFlag tmp = null;
                var ctFailed = 0;
                for (var buwff : bulkItemsWithFailedFlag) {
                    if (tmp == null) {
                        tmp = buwff;
                    }
                    if (buwff.failed) {
                        ctFailed++;
                    } else {
                        tmp = buwff;
                    }
                }
                if (ctFailed != bulkItemsWithFailedFlag.size()) {
                    LOG.warn(
                            "The id {} would result in an ack and a failure. Using only the ack for processing.",
                            id);
                }
                selected = Objects.requireNonNull(tmp);
            }

            if (associatedTuple != null) {
                LOG.debug("Found {} tuple(s) for ID {}", associatedTuple.size(), id);
                for (Tuple t : associatedTuple) {
                    String url = (String) t.getValueByField("url");

                    Metadata metadata = (Metadata) t.getValueByField("metadata");

                    if (!selected.failed) {
                        ackCount++;
                        _collector.emit(
                                StatusStreamName, t, new Values(url, metadata, Status.FETCHED));
                        _collector.ack(t);
                    } else {
                        failureCount++;
                        var failure = selected.getFailure();
                        LOG.error("update ID {}, URL {}, failure: {}", id, url, failure);
                        // there is something wrong with the content: it would fail
                        // again on replay, so treat it as an ERROR on the status
                        // stream and ack the tuple
                        if (failure.getStatus().equals(RestStatus.BAD_REQUEST)) {
                            metadata.setValue(Constants.STATUS_ERROR_SOURCE, "OpenSearch indexing");
                            metadata.setValue(Constants.STATUS_ERROR_MESSAGE, "invalid content");
                            _collector.emit(
                                    StatusStreamName, t, new Values(url, metadata, Status.ERROR));
                            _collector.ack(t);
                            LOG.debug("Acked {} with ID {}", url, id);
                        } else {
                            // otherwise just fail it so it gets replayed
                            _collector.fail(t);
                        }
                    }
                }
            } else {
                LOG.warn("Could not find unacked tuples for {}", entry.getKey());
            }
        }

        LOG.info(
                "Bulk response [{}] : items {}, waitAck {}, acked {}, failed {}",
                executionId,
                idsToBulkItemsWithFailedFlag.size(),
                estimatedSize,
                ackCount,
                failureCount);
        if (debugInfo != null) {
            for (String kinaw : debugInfo) {
                LOG.debug("Still in wait ack after bulk response [{}] => {}", executionId, kinaw);
            }
        }
    }

    @Override
    public void afterBulk(long executionId, BulkRequest request, Throwable failure) {
        eventCounter.scope("bulks_received").incrBy(1);
        LOG.error("Exception with bulk {} - failing the whole lot ", executionId, failure);

        final var failedIds =
                request.requests().stream()
                        .map(DocWriteRequest::id)
                        .collect(Collectors.toUnmodifiableSet());
        Map<String, List<Tuple>> failedTupleLists;
        waitAckLock.lock();
        try {
            failedTupleLists = waitAck.getAllPresent(failedIds);
            if (!failedTupleLists.isEmpty()) {
                waitAck.invalidateAll(failedTupleLists.keySet());
            }
        } finally {
            waitAckLock.unlock();
        }

        for (var id : failedIds) {
            var failedTuples = failedTupleLists.get(id);
            if (failedTuples != null) {
                LOG.debug("Failed {} tuple(s) for ID {}", failedTuples.size(), id);
                for (Tuple x : failedTuples) {
                    // fail it
                    eventCounter.scope("failed").incrBy(1);
                    _collector.fail(x);
                }
            } else {
                LOG.warn("Could not find unacked tuple for {}", id);
            }
        }
    }
}
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.stormcrawler.opensearch.filtering; + +import com.fasterxml.jackson.databind.JsonNode; +import java.io.ByteArrayInputStream; +import java.net.URL; +import java.util.Map; +import java.util.Timer; +import java.util.TimerTask; +import org.apache.stormcrawler.JSONResource; +import org.apache.stormcrawler.Metadata; +import org.apache.stormcrawler.filtering.URLFilter; +import org.apache.stormcrawler.opensearch.OpenSearchConnection; +import org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; +import org.opensearch.action.get.GetRequest; +import org.opensearch.action.get.GetResponse; +import org.opensearch.client.RequestOptions; +import org.opensearch.client.RestHighLevelClient; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Wraps a URLFilter whose resources are in a JSON file that can be stored in OpenSearch. The + * benefit of doing this is that the resources can be refreshed automatically and modified without + * having to recompile the jar and restart the topology. The connection to OpenSearch is done via + * the config and uses a new bolt type 'config'. + * + *

The configuration of the delegate is done in the urlfilters.json as usual. + * + *

+ *  {
+ *     "class": "org.apache.stormcrawler.opensearch.filtering.JSONURLFilterWrapper",
+ *     "name": "OSFastURLFilter",
+ *     "params": {
+ *         "refresh": "60",
+ *         "delegate": {
+ *             "class": "org.apache.stormcrawler.filtering.regex.FastURLFilter",
+ *             "params": {
+ *                 "file": "fast.urlfilter.json"
+ *             }
+ *         }
+ *     }
+ *  }
+ * 
+ * + * The resource file can be pushed to OpenSearch with + * + *
+ *  curl -XPUT 'localhost:9200/config/_doc/fast.urlfilter.json?pretty' -H 'Content-Type: application/json' -d @fast.urlfilter.json
+ * 
+ */ +public class JSONURLFilterWrapper extends URLFilter { + + private static final Logger LOG = LoggerFactory.getLogger(JSONURLFilterWrapper.class); + + private URLFilter delegatedURLFilter; + + public void configure(@NotNull Map stormConf, @NotNull JsonNode filterParams) { + + String urlfilterclass = null; + + JsonNode delegateNode = filterParams.get("delegate"); + if (delegateNode == null) { + throw new RuntimeException("delegateNode undefined!"); + } + + JsonNode node = delegateNode.get("class"); + if (node != null && node.isTextual()) { + urlfilterclass = node.asText(); + } + + if (urlfilterclass == null) { + throw new RuntimeException("urlfilter.class undefined!"); + } + + // load an instance of the delegated parsefilter + try { + Class filterClass = Class.forName(urlfilterclass); + + boolean subClassOK = URLFilter.class.isAssignableFrom(filterClass); + if (!subClassOK) { + throw new RuntimeException( + "Filter " + urlfilterclass + " does not extend URLFilter"); + } + + delegatedURLFilter = (URLFilter) filterClass.getDeclaredConstructor().newInstance(); + + // check that it implements JSONResource + if (!JSONResource.class.isInstance(delegatedURLFilter)) { + throw new RuntimeException( + "Filter " + urlfilterclass + " does not implement JSONResource"); + } + + } catch (Exception e) { + LOG.error("Can't setup {}: {}", urlfilterclass, e); + throw new RuntimeException("Can't setup " + urlfilterclass, e); + } + + // configure it + node = delegateNode.get("params"); + + delegatedURLFilter.configure(stormConf, node); + + int refreshRate = 600; + + node = filterParams.get("refresh"); + if (node != null && node.isInt()) { + refreshRate = node.asInt(refreshRate); + } + + final JSONResource resource = (JSONResource) delegatedURLFilter; + + new Timer() + .schedule( + new TimerTask() { + private RestHighLevelClient osClient; + + public void run() { + if (osClient == null) { + try { + osClient = + OpenSearchConnection.getClient(stormConf, "config"); + } catch (Exception 
e) { + LOG.error( + "Exception while creating OpenSearch connection", + e); + } + } + if (osClient != null) { + LOG.info("Reloading json resources from OpenSearch"); + try { + GetResponse response = + osClient.get( + new GetRequest( + "config", + resource.getResourceFile()), + RequestOptions.DEFAULT); + resource.loadJSONResources( + new ByteArrayInputStream( + response.getSourceAsBytes())); + } catch (Exception e) { + LOG.error("Can't load config from OpenSearch", e); + } + } + } + }, + 0, + refreshRate * 1000); + } + + @Override + public @Nullable String filter( + @Nullable URL sourceUrl, + @Nullable Metadata sourceMetadata, + @NotNull String urlToFilter) { + return delegatedURLFilter.filter(sourceUrl, sourceMetadata, urlToFilter); + } +} diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/metrics/MetricsConsumer.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/metrics/MetricsConsumer.java new file mode 100644 index 000000000..6b9ccf4cb --- /dev/null +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/metrics/MetricsConsumer.java @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.stormcrawler.opensearch.metrics; + +import static org.opensearch.common.xcontent.XContentFactory.jsonBuilder; + +import java.io.IOException; +import java.text.SimpleDateFormat; +import java.util.Collection; +import java.util.Date; +import java.util.Locale; +import java.util.Map; +import java.util.Map.Entry; +import org.apache.storm.metric.api.IMetricsConsumer; +import org.apache.storm.task.IErrorReporter; +import org.apache.storm.task.TopologyContext; +import org.apache.stormcrawler.opensearch.IndexCreation; +import org.apache.stormcrawler.opensearch.OpenSearchConnection; +import org.apache.stormcrawler.util.ConfUtils; +import org.opensearch.action.index.IndexRequest; +import org.opensearch.core.xcontent.XContentBuilder; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Sends metrics to an OpenSearch index. The OpenSearch details are set in the configuration; an + * optional argument sets a date format to append to the index name. + * + *
+ *   topology.metrics.consumer.register:
+ *        - class: "org.apache.stormcrawler.opensearch.metrics.MetricsConsumer"
+ *          parallelism.hint: 1
+ *          argument: "yyyy-MM-dd"
+ * 
+ */ +public class MetricsConsumer implements IMetricsConsumer { + + private final Logger LOG = LoggerFactory.getLogger(getClass()); + + private static final String OSBoltType = "metrics"; + + /** name of the index to use for the metrics (default : metrics) * */ + private static final String OSMetricsIndexNameParamName = + "opensearch." + OSBoltType + ".index.name"; + + private String indexName; + + private OpenSearchConnection connection; + + private String stormID; + + /** optional date format passed as argument, must be parsable as a SimpleDateFormat */ + private SimpleDateFormat dateFormat; + + @Override + public void prepare( + Map stormConf, + Object registrationArgument, + TopologyContext context, + IErrorReporter errorReporter) { + indexName = ConfUtils.getString(stormConf, OSMetricsIndexNameParamName, "metrics"); + stormID = context.getStormId(); + if (registrationArgument != null) { + dateFormat = new SimpleDateFormat((String) registrationArgument, Locale.ROOT); + LOG.info("Using date format {}", registrationArgument); + } + try { + connection = OpenSearchConnection.getConnection(stormConf, OSBoltType); + } catch (Exception e1) { + LOG.error("Can't connect to OpenSearch", e1); + throw new RuntimeException(e1); + } + + // create a template if it doesn't exist + try { + IndexCreation.checkOrCreateIndexTemplate(connection.getClient(), OSBoltType, LOG); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Override + public void cleanup() { + if (connection != null) { + connection.close(); + } + } + + @Override + public void handleDataPoints(TaskInfo taskInfo, Collection dataPoints) { + final Date now = new Date(); + for (DataPoint dataPoint : dataPoints) { + handleDataPoints(taskInfo, dataPoint.name, dataPoint.value, now); + } + } + + private void handleDataPoints( + final TaskInfo taskInfo, final String nameprefix, final Object value, final Date now) { + if (value instanceof Number) { + indexDataPoint(taskInfo, now, nameprefix, ((Number) 
value).doubleValue()); + } else if (value instanceof Map) { + for (Entry entry : ((Map) value).entrySet()) { + String newnameprefix = nameprefix + "." + entry.getKey(); + handleDataPoints(taskInfo, newnameprefix, entry.getValue(), now); + } + } else if (value instanceof Collection) { + for (Object collectionObj : (Collection) value) { + handleDataPoints(taskInfo, nameprefix, collectionObj, now); + } + } else { + LOG.warn("Found data point value {} of {}", nameprefix, value.getClass().toString()); + } + } + + /** + * Returns the name of the index that metrics will be written to. + * + * @return elastic index name + */ + private String getIndexName(Date timestamp) { + if (dateFormat == null) { + return indexName; + } + + StringBuilder sb = new StringBuilder(indexName); + sb.append("-").append(dateFormat.format(timestamp)); + return sb.toString(); + } + + private void indexDataPoint(TaskInfo taskInfo, Date timestamp, String name, double value) { + try { + XContentBuilder builder = jsonBuilder().startObject(); + builder.field("stormId", stormID); + builder.field("srcComponentId", taskInfo.srcComponentId); + builder.field("srcTaskId", taskInfo.srcTaskId); + builder.field("srcWorkerHost", taskInfo.srcWorkerHost); + builder.field("srcWorkerPort", taskInfo.srcWorkerPort); + builder.field("name", name); + builder.field("value", value); + builder.field("timestamp", timestamp); + builder.endObject(); + + IndexRequest indexRequest = new IndexRequest(getIndexName(timestamp)).source(builder); + connection.addToProcessor(indexRequest); + } catch (Exception e) { + LOG.error("problem when building request for OpenSearch", e); + } + } +} diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/metrics/StatusMetricsBolt.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/metrics/StatusMetricsBolt.java new file mode 100644 index 000000000..56edf6967 --- /dev/null +++ 
b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/metrics/StatusMetricsBolt.java @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.stormcrawler.opensearch.metrics; + +import java.util.HashMap; +import java.util.Map; +import org.apache.storm.Config; +import org.apache.storm.task.OutputCollector; +import org.apache.storm.task.TopologyContext; +import org.apache.storm.topology.OutputFieldsDeclarer; +import org.apache.storm.topology.base.BaseRichBolt; +import org.apache.storm.tuple.Tuple; +import org.apache.storm.utils.TupleUtils; +import org.apache.stormcrawler.opensearch.Constants; +import org.apache.stormcrawler.opensearch.OpenSearchConnection; +import org.apache.stormcrawler.util.ConfUtils; +import org.opensearch.client.RequestOptions; +import org.opensearch.client.core.CountRequest; +import org.opensearch.client.core.CountResponse; +import org.opensearch.core.action.ActionListener; +import org.opensearch.index.query.QueryBuilders; +import org.opensearch.search.builder.SearchSourceBuilder; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Queries the status index periodically to get the count of URLs per status. 
This bolt can be + * connected to the output of any other bolt and will not produce anything as output. + */ +public class StatusMetricsBolt extends BaseRichBolt { + + private static final Logger LOG = LoggerFactory.getLogger(StatusMetricsBolt.class); + + private static final String OSBoltType = "status"; + private static final String OSStatusIndexNameParamName = + Constants.PARAMPREFIX + "status.index.name"; + + private String indexName; + + private OpenSearchConnection connection; + + private Map latestStatusCounts = new HashMap<>(6); + + private int freqStats = 60; + + private OutputCollector _collector; + + private transient StatusActionListener[] listeners; + + private class StatusActionListener implements ActionListener { + + private final String name; + + private boolean ready = true; + + public boolean isReady() { + return ready; + } + + public void busy() { + this.ready = false; + } + + StatusActionListener(String statusName) { + name = statusName; + } + + @Override + public void onResponse(CountResponse response) { + ready = true; + LOG.debug("Got {} counts for status:{}", response.getCount(), name); + latestStatusCounts.put(name, response.getCount()); + } + + @Override + public void onFailure(Exception e) { + ready = true; + LOG.error("Failure when getting counts for status:{}", name, e); + } + } + + @Override + public void prepare( + Map stormConf, TopologyContext context, OutputCollector collector) { + _collector = collector; + indexName = ConfUtils.getString(stormConf, OSStatusIndexNameParamName, "status"); + try { + connection = OpenSearchConnection.getConnection(stormConf, OSBoltType); + } catch (Exception e1) { + LOG.error("Can't connect to ElasticSearch", e1); + throw new RuntimeException(e1); + } + + context.registerMetric( + "status.count", + () -> { + return latestStatusCounts; + }, + freqStats); + + listeners = new StatusActionListener[6]; + + listeners[0] = new StatusActionListener("DISCOVERED"); + listeners[1] = new 
StatusActionListener("FETCHED"); + listeners[2] = new StatusActionListener("FETCH_ERROR"); + listeners[3] = new StatusActionListener("REDIRECTION"); + listeners[4] = new StatusActionListener("ERROR"); + listeners[5] = new StatusActionListener("TOTAL"); + } + + @Override + public Map getComponentConfiguration() { + Config conf = new Config(); + conf.put(Config.TOPOLOGY_TICK_TUPLE_FREQ_SECS, freqStats); + return conf; + } + + @Override + public void execute(Tuple input) { + _collector.ack(input); + + // this bolt can be connected to anything + // we just want to trigger a new search when the input is a tick tuple + if (!TupleUtils.isTick(input)) { + return; + } + + for (StatusActionListener listener : listeners) { + // still waiting for results from previous request + if (!listener.isReady()) { + LOG.debug("Not ready to get counts for status {}", listener.name); + continue; + } + CountRequest request = new CountRequest(indexName); + if (!listener.name.equalsIgnoreCase("TOTAL")) { + SearchSourceBuilder sourceBuilder = new SearchSourceBuilder(); + sourceBuilder.query(QueryBuilders.termQuery("status", listener.name)); + request.source(sourceBuilder); + } + listener.busy(); + connection.getClient().countAsync(request, RequestOptions.DEFAULT, listener); + } + } + + @Override + public void cleanup() { + connection.close(); + } + + @Override + public void declareOutputFields(OutputFieldsDeclarer declarer) { + // NONE - THIS BOLT DOES NOT GET CONNECTED TO ANY OTHERS + } +} diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java new file mode 100644 index 000000000..e475afb2e --- /dev/null +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more 
+ * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.stormcrawler.opensearch.parse.filter; + +import com.fasterxml.jackson.databind.JsonNode; +import java.io.ByteArrayInputStream; +import java.util.Map; +import java.util.Timer; +import java.util.TimerTask; +import org.apache.stormcrawler.JSONResource; +import org.apache.stormcrawler.opensearch.OpenSearchConnection; +import org.apache.stormcrawler.parse.ParseFilter; +import org.apache.stormcrawler.parse.ParseResult; +import org.jetbrains.annotations.NotNull; +import org.opensearch.action.get.GetRequest; +import org.opensearch.action.get.GetResponse; +import org.opensearch.client.RequestOptions; +import org.opensearch.client.RestHighLevelClient; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.DocumentFragment; + +/** + * Wraps a ParseFilter whose resources are in a JSON file that can be stored in OpenSearch. The + * benefit of doing this is that the resources can be refreshed automatically and modified without + * having to recompile the jar and restart the topology. The connection to OpenSearch is done via + * the config and uses a new bolt type 'config'. + * + *

The configuration of the delegate is done in the parsefilters.json as usual. + * + *

+ *  {
+ *     "class": "org.apache.stormcrawler.opensearch.parse.filter.JSONResourceWrapper",
+ *     "name": "OpenSearchCollectionTagger",
+ *     "params": {
+ *         "refresh": "60",
+ *         "delegate": {
+ *             "class": "org.apache.stormcrawler.parse.filter.CollectionTagger",
+ *             "params": {
+ *                 "file": "collections.json"
+ *             }
+ *         }
+ *     }
+ *  }
+ * 
+ * + * The resource file can be pushed to OpenSearch with + * + *
+ *  curl -XPUT "$OSHOST/config/_create/collections.json" -H 'Content-Type: application/json' -d @src/main/resources/collections.json
+ * 
+ */ +public class JSONResourceWrapper extends ParseFilter { + + private static final Logger LOG = LoggerFactory.getLogger(JSONResourceWrapper.class); + + private ParseFilter delegatedParseFilter; + + public void configure(@NotNull Map stormConf, @NotNull JsonNode filterParams) { + + String parsefilterclass = null; + + JsonNode delegateNode = filterParams.get("delegate"); + if (delegateNode == null) { + throw new RuntimeException("delegateNode undefined!"); + } + + JsonNode node = delegateNode.get("class"); + if (node != null && node.isTextual()) { + parsefilterclass = node.asText(); + } + + if (parsefilterclass == null) { + throw new RuntimeException("parsefilter.class undefined!"); + } + + // load an instance of the delegated parsefilter + try { + Class filterClass = Class.forName(parsefilterclass); + + boolean subClassOK = ParseFilter.class.isAssignableFrom(filterClass); + if (!subClassOK) { + throw new RuntimeException( + "Filter " + parsefilterclass + " does not extend ParseFilter"); + } + + delegatedParseFilter = (ParseFilter) filterClass.getDeclaredConstructor().newInstance(); + + // check that it implements JSONResource + if (!JSONResource.class.isInstance(delegatedParseFilter)) { + throw new RuntimeException( + "Filter " + parsefilterclass + " does not implement JSONResource"); + } + + } catch (Exception e) { + LOG.error("Can't setup {}: {}", parsefilterclass, e); + throw new RuntimeException("Can't setup " + parsefilterclass, e); + } + + // configure it + node = delegateNode.get("params"); + + delegatedParseFilter.configure(stormConf, node); + + int refreshRate = 600; + + node = filterParams.get("refresh"); + if (node != null && node.isInt()) { + refreshRate = node.asInt(refreshRate); + } + + final JSONResource resource = (JSONResource) delegatedParseFilter; + + new Timer() + .schedule( + new TimerTask() { + private RestHighLevelClient esClient; + + public void run() { + if (esClient == null) { + try { + esClient = + 
OpenSearchConnection.getClient(stormConf, "config"); + } catch (Exception e) { + LOG.error( + "Exception while creating OpenSearch connection", + e); + } + } + if (esClient != null) { + LOG.info("Reloading json resources from OpenSearch"); + try { + GetResponse response = + esClient.get( + new GetRequest( + "config", + resource.getResourceFile()), + RequestOptions.DEFAULT); + resource.loadJSONResources( + new ByteArrayInputStream( + response.getSourceAsBytes())); + } catch (Exception e) { + LOG.error("Can't load config from OpenSearch", e); + } + } + } + }, + 0, + refreshRate * 1000); + } + + @Override + public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult parse) { + delegatedParseFilter.filter(URL, content, doc, parse); + } +} diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/AbstractSpout.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/AbstractSpout.java new file mode 100644 index 000000000..43b0e4289 --- /dev/null +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/AbstractSpout.java @@ -0,0 +1,236 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.stormcrawler.opensearch.persistence; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import org.apache.storm.spout.SpoutOutputCollector; +import org.apache.storm.task.TopologyContext; +import org.apache.stormcrawler.Metadata; +import org.apache.stormcrawler.opensearch.Constants; +import org.apache.stormcrawler.opensearch.IndexCreation; +import org.apache.stormcrawler.opensearch.OpenSearchConnection; +import org.apache.stormcrawler.persistence.AbstractQueryingSpout; +import org.apache.stormcrawler.util.ConfUtils; +import org.opensearch.client.RestHighLevelClient; +import org.opensearch.search.SearchHit; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public abstract class AbstractSpout extends AbstractQueryingSpout { + + private static final Logger LOG = LoggerFactory.getLogger(AbstractSpout.class); + + protected static final String OSBoltType = "status"; + protected static final String OSStatusIndexNameParamName = + Constants.PARAMPREFIX + OSBoltType + ".index.name"; + + /** Field name to use for aggregating * */ + protected static final String OSStatusBucketFieldParamName = + Constants.PARAMPREFIX + OSBoltType + ".bucket.field"; + + protected static final String OSStatusMaxBucketParamName = + Constants.PARAMPREFIX + OSBoltType + ".max.buckets"; + protected static final String OSStatusMaxURLsParamName = + Constants.PARAMPREFIX + OSBoltType + ".max.urls.per.bucket"; + + /** Field name to use for sorting the URLs within a bucket, not used if empty or null. */ + protected static final String OSStatusBucketSortFieldParamName = + Constants.PARAMPREFIX + OSBoltType + ".bucket.sort.field"; + + /** Field name to use for sorting the buckets, not used if empty or null. 
*/ + protected static final String OSStatusGlobalSortFieldParamName = + Constants.PARAMPREFIX + OSBoltType + ".global.sort.field"; + + protected static final String OSStatusFilterParamName = + Constants.PARAMPREFIX + OSBoltType + ".filterQuery"; + + protected static final String OSStatusQueryTimeoutParamName = + Constants.PARAMPREFIX + OSBoltType + ".query.timeout"; + + /** Query to use as a positive filter, set by es.status.filterQuery */ + protected List filterQueries = null; + + protected String indexName; + + protected static RestHighLevelClient client; + + /** + * when using multiple instances - each one is in charge of a specific shard useful when + * sharding based on host or domain to guarantee a good mix of URLs + */ + protected int shardID = -1; + + /** Used to distinguish between instances in the logs * */ + protected String logIdprefix = ""; + + /** Field name used for field collapsing e.g. key * */ + protected String partitionField; + + protected int maxURLsPerBucket = 10; + + protected int maxBucketNum = 10; + + protected List bucketSortField = new ArrayList<>(); + + protected String totalSortField = ""; + + protected Date queryDate; + + protected int queryTimeout = -1; + + @Override + public void open( + Map stormConf, + TopologyContext context, + SpoutOutputCollector collector) { + + super.open(stormConf, context, collector); + + indexName = ConfUtils.getString(stormConf, OSStatusIndexNameParamName, "status"); + + // one OS client per JVM + synchronized (AbstractSpout.class) { + try { + if (client == null) { + client = OpenSearchConnection.getClient(stormConf, OSBoltType); + } + } catch (Exception e1) { + LOG.error("Can't connect to ElasticSearch", e1); + throw new RuntimeException(e1); + } + + // use the default status schema if none has been specified + try { + IndexCreation.checkOrCreateIndex(client, indexName, OSBoltType, LOG); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + // if more than one instance is used we expect 
their number to be the + // same as the number of shards + int totalTasks = context.getComponentTasks(context.getThisComponentId()).size(); + if (totalTasks > 1) { + logIdprefix = + "[" + context.getThisComponentId() + " #" + context.getThisTaskIndex() + "] "; + + // determine the number of shards so that we can restrict the + // search + + // TODO use the admin API when it gets available + // TODO or the low level one with + // https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-shards-stores.html + // TODO identify local shards and use those if possible + + // ClusterSearchShardsRequest request = new + // ClusterSearchShardsRequest( + // indexName); + // ClusterSearchShardsResponse shardresponse = client.admin() + // .cluster().searchShards(request).actionGet(); + // ClusterSearchShardsGroup[] shardgroups = + // shardresponse.getGroups(); + // if (totalTasks != shardgroups.length) { + // throw new RuntimeException( + // "Number of OS spout instances should be the same as number of + // shards (" + // + shardgroups.length + ") but is " + totalTasks); + // } + // shardID = shardgroups[context.getThisTaskIndex()].getShardId() + // .getId(); + + // TEMPORARY simply use the task index as shard index + shardID = context.getThisTaskIndex(); + LOG.info("{} assigned shard ID {}", logIdprefix, shardID); + } + + partitionField = ConfUtils.getString(stormConf, OSStatusBucketFieldParamName, "key"); + + bucketSortField = ConfUtils.loadListFromConf(OSStatusBucketSortFieldParamName, stormConf); + + totalSortField = ConfUtils.getString(stormConf, OSStatusGlobalSortFieldParamName); + + maxURLsPerBucket = ConfUtils.getInt(stormConf, OSStatusMaxURLsParamName, 1); + maxBucketNum = ConfUtils.getInt(stormConf, OSStatusMaxBucketParamName, 10); + + queryTimeout = ConfUtils.getInt(stormConf, OSStatusQueryTimeoutParamName, -1); + + filterQueries = ConfUtils.loadListFromConf(OSStatusFilterParamName, stormConf); + } + + /** Builds a query and use it retrieve the results 
from OS * */ + protected abstract void populateBuffer(); + + protected final boolean addHitToBuffer(SearchHit hit) { + Map keyValues = hit.getSourceAsMap(); + String url = (String) keyValues.get("url"); + // is already being processed - skip it! + if (beingProcessed.containsKey(url)) { + return false; + } + return buffer.add(url, fromKeyValues(keyValues)); + } + + protected final Metadata fromKeyValues(Map keyValues) { + Map> mdAsMap = (Map>) keyValues.get("metadata"); + Metadata metadata = new Metadata(); + if (mdAsMap != null) { + for (Entry> mdEntry : mdAsMap.entrySet()) { + String key = mdEntry.getKey(); + // periods are not allowed - replace with %2E + key = key.replaceAll("%2E", "\\."); + Object mdValObj = mdEntry.getValue(); + // single value + if (mdValObj instanceof String) { + metadata.addValue(key, (String) mdValObj); + } else { + // multi valued + metadata.addValues(key, (List) mdValObj); + } + } + } + return metadata; + } + + @Override + public void ack(Object msgId) { + LOG.debug("{} Ack for {}", logIdprefix, msgId); + super.ack(msgId); + } + + @Override + public void fail(Object msgId) { + LOG.info("{} Fail for {}", logIdprefix, msgId); + super.fail(msgId); + } + + @Override + public void close() { + if (client != null) { + try { + client.close(); + } catch (IOException e) { + LOG.error("Exception caught when closing client", e); + } + } + } +} diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/AggregationSpout.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/AggregationSpout.java new file mode 100644 index 000000000..2eb97102f --- /dev/null +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/AggregationSpout.java @@ -0,0 +1,373 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.stormcrawler.opensearch.persistence; + +import static org.opensearch.index.query.QueryBuilders.boolQuery; + +import java.time.Instant; +import java.util.Calendar; +import java.util.Date; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.TimeZone; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.StringUtils; +import org.apache.storm.spout.SpoutOutputCollector; +import org.apache.storm.task.TopologyContext; +import org.apache.stormcrawler.Metadata; +import org.apache.stormcrawler.opensearch.Constants; +import org.apache.stormcrawler.util.ConfUtils; +import org.joda.time.format.ISODateTimeFormat; +import org.opensearch.action.search.SearchRequest; +import org.opensearch.action.search.SearchResponse; +import org.opensearch.client.RequestOptions; +import org.opensearch.core.action.ActionListener; +import org.opensearch.index.query.BoolQueryBuilder; +import org.opensearch.index.query.QueryBuilders; +import org.opensearch.search.SearchHit; +import org.opensearch.search.aggregations.AggregationBuilders; +import org.opensearch.search.aggregations.Aggregations; +import org.opensearch.search.aggregations.BucketOrder; +import 
org.opensearch.search.aggregations.bucket.SingleBucketAggregation; +import org.opensearch.search.aggregations.bucket.sampler.DiversifiedAggregationBuilder; +import org.opensearch.search.aggregations.bucket.terms.Terms; +import org.opensearch.search.aggregations.bucket.terms.Terms.Bucket; +import org.opensearch.search.aggregations.bucket.terms.TermsAggregationBuilder; +import org.opensearch.search.aggregations.metrics.TopHits; +import org.opensearch.search.builder.SearchSourceBuilder; +import org.opensearch.search.sort.FieldSortBuilder; +import org.opensearch.search.sort.SortBuilders; +import org.opensearch.search.sort.SortOrder; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Spout which pulls URL from an OpenSearch index. Use a single instance unless you use + * 'opensearch.status.routing' with the StatusUpdaterBolt, in which case you need to have exactly + * the same number of spout instances as OpenSearch shards. Guarantees a good mix of URLs by + * aggregating them by an arbitrary field e.g. key. 
+ */ +public class AggregationSpout extends AbstractSpout implements ActionListener { + + private static final Logger LOG = LoggerFactory.getLogger(AggregationSpout.class); + + private static final String StatusSampleParamName = Constants.PARAMPREFIX + "status.sample"; + private static final String MostRecentDateIncreaseParamName = + Constants.PARAMPREFIX + "status.recentDate.increase"; + private static final String MostRecentDateMinGapParamName = + Constants.PARAMPREFIX + "status.recentDate.min.gap"; + + private boolean sample = false; + + private int recentDateIncrease = -1; + private int recentDateMinGap = -1; + + protected Set currentBuckets; + + @Override + public void open( + Map stormConf, + TopologyContext context, + SpoutOutputCollector collector) { + sample = ConfUtils.getBoolean(stormConf, StatusSampleParamName, sample); + recentDateIncrease = + ConfUtils.getInt(stormConf, MostRecentDateIncreaseParamName, recentDateIncrease); + recentDateMinGap = + ConfUtils.getInt(stormConf, MostRecentDateMinGapParamName, recentDateMinGap); + super.open(stormConf, context, collector); + currentBuckets = new HashSet<>(); + } + + @Override + protected void populateBuffer() { + + if (queryDate == null) { + queryDate = new Date(); + lastTimeResetToNow = Instant.now(); + } + + String formattedQueryDate = ISODateTimeFormat.dateTimeNoMillis().print(queryDate.getTime()); + + LOG.info("{} Populating buffer with nextFetchDate <= {}", logIdprefix, formattedQueryDate); + + BoolQueryBuilder queryBuilder = + boolQuery() + .filter(QueryBuilders.rangeQuery("nextFetchDate").lte(formattedQueryDate)); + + if (filterQueries != null) { + for (String filterQuery : filterQueries) { + queryBuilder.filter(QueryBuilders.queryStringQuery(filterQuery)); + } + } + + SearchRequest request = new SearchRequest(indexName); + + SearchSourceBuilder sourceBuilder = new SearchSourceBuilder(); + sourceBuilder.query(queryBuilder); + sourceBuilder.from(0); + sourceBuilder.size(0); + 
sourceBuilder.explain(false); + sourceBuilder.trackTotalHits(false); + + if (queryTimeout != -1) { + sourceBuilder.timeout( + new org.opensearch.common.unit.TimeValue(queryTimeout, TimeUnit.SECONDS)); + } + + TermsAggregationBuilder aggregations = + AggregationBuilders.terms("partition").field(partitionField).size(maxBucketNum); + + org.opensearch.search.aggregations.metrics.TopHitsAggregationBuilder tophits = + AggregationBuilders.topHits("docs").size(maxURLsPerBucket).explain(false); + + // sort within a bucket + for (String bsf : bucketSortField) { + FieldSortBuilder sorter = SortBuilders.fieldSort(bsf).order(SortOrder.ASC); + tophits.sort(sorter); + } + + aggregations.subAggregation(tophits); + + // sort between buckets + if (StringUtils.isNotBlank(totalSortField)) { + org.opensearch.search.aggregations.metrics.MinAggregationBuilder minBuilder = + AggregationBuilders.min("top_hit").field(totalSortField); + aggregations.subAggregation(minBuilder); + aggregations.order(BucketOrder.aggregation("top_hit", true)); + } + + if (sample) { + DiversifiedAggregationBuilder sab = new DiversifiedAggregationBuilder("sample"); + sab.field(partitionField).maxDocsPerValue(maxURLsPerBucket); + sab.shardSize(maxURLsPerBucket * maxBucketNum); + sab.subAggregation(aggregations); + sourceBuilder.aggregation(sab); + } else { + sourceBuilder.aggregation(aggregations); + } + + request.source(sourceBuilder); + + // https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-preference.html + // _shards:2,3 + // specific shard but ideally a local copy of it + if (shardID != -1) { + request.preference("_shards:" + shardID + "|_local"); + } + + // dump query to log + LOG.debug("{} OpenSearch query {}", logIdprefix, request); + + LOG.trace("{} isInquery set to true", logIdprefix); + isInQuery.set(true); + client.searchAsync(request, RequestOptions.DEFAULT, this); + } + + @Override + public void onFailure(Exception arg0) { + LOG.error("{} Exception with OpenSearch query", logIdprefix,
arg0); + markQueryReceivedNow(); + } + + @Override + public void onResponse(SearchResponse response) { + long timeTaken = System.currentTimeMillis() - getTimeLastQuerySent(); + + Aggregations aggregs = response.getAggregations(); + + if (aggregs == null) { + markQueryReceivedNow(); + return; + } + + SingleBucketAggregation sample = aggregs.get("sample"); + if (sample != null) { + aggregs = sample.getAggregations(); + } + + Terms agg = aggregs.get("partition"); + + int numhits = 0; + int numBuckets = 0; + int alreadyprocessed = 0; + + Instant mostRecentDateFound = null; + + currentBuckets.clear(); + + // For each entry + Iterator iterator = (Iterator) agg.getBuckets().iterator(); + while (iterator.hasNext()) { + Terms.Bucket entry = iterator.next(); + String key = (String) entry.getKey(); // bucket key + + currentBuckets.add(key); + + long docCount = entry.getDocCount(); // Doc count + + int hitsForThisBucket = 0; + + SearchHit lastHit = null; + + // filter results so that we don't include URLs we are already + // being processed + TopHits topHits = entry.getAggregations().get("docs"); + for (SearchHit hit : topHits.getHits().getHits()) { + + LOG.debug( + "{} -> id [{}], _source [{}]", + logIdprefix, + hit.getId(), + hit.getSourceAsString()); + + hitsForThisBucket++; + + lastHit = hit; + + Map keyValues = hit.getSourceAsMap(); + String url = (String) keyValues.get("url"); + + // consider only the first document of the last bucket + // for optimising the nextFetchDate + if (hitsForThisBucket == 1 && !iterator.hasNext()) { + String strDate = (String) keyValues.get("nextFetchDate"); + try { + mostRecentDateFound = Instant.parse(strDate); + } catch (Exception e) { + throw new RuntimeException("can't parse date :" + strDate); + } + } + + // is already being processed or in buffer - skip it! 
+ if (beingProcessed.containsKey(url)) { + LOG.debug("{} -> already processed: {}", logIdprefix, url); + alreadyprocessed++; + continue; + } + + Metadata metadata = fromKeyValues(keyValues); + boolean added = buffer.add(url, metadata); + if (!added) { + LOG.debug("{} -> already in buffer: {}", logIdprefix, url); + alreadyprocessed++; + continue; + } + LOG.debug("{} -> added to buffer : {}", logIdprefix, url); + } + + if (lastHit != null) { + sortValuesForKey(key, lastHit.getSortValues()); + } + + if (hitsForThisBucket > 0) { + numBuckets++; + } + + numhits += hitsForThisBucket; + + LOG.debug( + "{} key [{}], hits[{}], doc_count [{}], already processed [{}]", + logIdprefix, + key, + hitsForThisBucket, + docCount, + alreadyprocessed); + } + + LOG.info( + "{} OpenSearch query returned {} hits from {} buckets in {} msec with {} already being processed. Took {} msec per doc on average.", + logIdprefix, + numhits, + numBuckets, + timeTaken, + alreadyprocessed, + ((float) timeTaken / numhits)); + + queryTimes.addMeasurement(timeTaken); + eventCounter.scope("already_being_processed").incrBy(alreadyprocessed); + eventCounter.scope("ES_queries").incrBy(1); + eventCounter.scope("ES_docs").incrBy(numhits); + + // optimise the nextFetchDate by getting the most recent value + // returned in the query and add to it, unless the previous value is + // within n mins in which case we'll keep it + if (mostRecentDateFound != null && recentDateIncrease >= 0) { + Calendar potentialNewDate = + Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT); + potentialNewDate.setTimeInMillis(mostRecentDateFound.toEpochMilli()); + potentialNewDate.add(Calendar.MINUTE, recentDateIncrease); + Date oldDate = null; + // check boundaries + if (this.recentDateMinGap > 0) { + Calendar low = Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT); + low.setTime(queryDate); + low.add(Calendar.MINUTE, -recentDateMinGap); + Calendar high = Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT); +
high.setTime(queryDate); + high.add(Calendar.MINUTE, recentDateMinGap); + if (high.before(potentialNewDate) || low.after(potentialNewDate)) { + oldDate = queryDate; + } + } else { + oldDate = queryDate; + } + if (oldDate != null) { + queryDate = potentialNewDate.getTime(); + LOG.info( + "{} queryDate changed from {} to {} based on mostRecentDateFound {}", + logIdprefix, + oldDate, + queryDate, + mostRecentDateFound); + } else { + LOG.info( + "{} queryDate kept at {} based on mostRecentDateFound {}", + logIdprefix, + queryDate, + mostRecentDateFound); + } + } + + // reset the value for next fetch date if the previous one is too old + if (resetFetchDateAfterNSecs != -1) { + Instant changeNeededOn = + Instant.ofEpochMilli( + lastTimeResetToNow.toEpochMilli() + (resetFetchDateAfterNSecs * 1000L)); + if (Instant.now().isAfter(changeNeededOn)) { + LOG.info( + "{} queryDate set to null based on resetFetchDateAfterNSecs {}", + logIdprefix, + resetFetchDateAfterNSecs); + queryDate = null; + } + } + + // change the date if we don't get any results at all + if (numBuckets == 0) { + queryDate = null; + } + + // remove lock + markQueryReceivedNow(); + } + + protected void sortValuesForKey(String key, Object[] sortValues) {} +} diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/HybridSpout.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/HybridSpout.java new file mode 100644 index 000000000..551153f52 --- /dev/null +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/HybridSpout.java @@ -0,0 +1,227 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.stormcrawler.opensearch.persistence; + +import static org.opensearch.index.query.QueryBuilders.boolQuery; + +import com.github.benmanes.caffeine.cache.Cache; +import com.github.benmanes.caffeine.cache.Caffeine; +import java.time.Instant; +import java.util.Date; +import java.util.List; +import java.util.Map; +import org.apache.storm.spout.SpoutOutputCollector; +import org.apache.storm.task.TopologyContext; +import org.apache.stormcrawler.opensearch.Constants; +import org.apache.stormcrawler.persistence.EmptyQueueListener; +import org.apache.stormcrawler.util.ConfUtils; +import org.joda.time.format.ISODateTimeFormat; +import org.opensearch.action.search.SearchRequest; +import org.opensearch.action.search.SearchResponse; +import org.opensearch.client.RequestOptions; +import org.opensearch.core.action.ActionListener; +import org.opensearch.index.query.BoolQueryBuilder; +import org.opensearch.index.query.QueryBuilders; +import org.opensearch.search.SearchHit; +import org.opensearch.search.builder.SearchSourceBuilder; +import org.opensearch.search.sort.FieldSortBuilder; +import org.opensearch.search.sort.SortBuilders; +import org.opensearch.search.sort.SortOrder; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Uses collapsing spouts to get an initial set of URLs and keys to query for and gets emptyQueue + * notifications from the URLBuffer to 
query OpenSearch for a specific key. + * + * @since 1.15 + */ +public class HybridSpout extends AggregationSpout implements EmptyQueueListener { + + private static final Logger LOG = LoggerFactory.getLogger(HybridSpout.class); + + protected static final String RELOADPARAMNAME = + Constants.PARAMPREFIX + "status.max.urls.per.reload"; + + private int bufferReloadSize = 10; + + private Cache searchAfterCache; + + private HostResultListener hrl; + + @Override + public void open( + Map stormConf, + TopologyContext context, + SpoutOutputCollector collector) { + super.open(stormConf, context, collector); + bufferReloadSize = ConfUtils.getInt(stormConf, RELOADPARAMNAME, maxURLsPerBucket); + buffer.setEmptyQueueListener(this); + searchAfterCache = Caffeine.newBuilder().build(); + hrl = new HostResultListener(); + } + + @Override + public void emptyQueue(String queueName) { + + LOG.info("{} Emptied buffer queue for {}", logIdprefix, queueName); + + if (!currentBuckets.contains(queueName)) { + // not interested in this one any more + return; + } + + // reloading the aggregs - searching now + // would just overload OpenSearch and yield + // mainly duplicates + if (isInQuery.get()) { + LOG.trace("{} isInquery true for {}", logIdprefix, queueName); + return; + } + + LOG.info("{} Querying for more docs for {}", logIdprefix, queueName); + + if (queryDate == null) { + queryDate = new Date(); + lastTimeResetToNow = Instant.now(); + } + + String formattedQueryDate = ISODateTimeFormat.dateTimeNoMillis().print(queryDate.getTime()); + + BoolQueryBuilder queryBuilder = + boolQuery() + .filter(QueryBuilders.rangeQuery("nextFetchDate").lte(formattedQueryDate)); + + queryBuilder.filter(QueryBuilders.termQuery(partitionField, queueName)); + + SearchSourceBuilder sourceBuilder = new SearchSourceBuilder(); + sourceBuilder.query(queryBuilder); + sourceBuilder.from(0); + sourceBuilder.size(bufferReloadSize); + sourceBuilder.explain(false); + sourceBuilder.trackTotalHits(false); + + // sort within a
bucket + for (String bsf : bucketSortField) { + FieldSortBuilder sorter = SortBuilders.fieldSort(bsf).order(SortOrder.ASC); + sourceBuilder.sort(sorter); + } + + // do we have a search after for this one? + Object[] searchAfterValues = searchAfterCache.getIfPresent(queueName); + if (searchAfterValues != null) { + sourceBuilder.searchAfter(searchAfterValues); + } + + SearchRequest request = new SearchRequest(indexName); + + request.source(sourceBuilder); + + // https://www.elastic.co/guide/en/opensearch/reference/current/search-request-preference.html + // _shards:2,3 + // specific shard but ideally a local copy of it + if (shardID != -1) { + request.preference("_shards:" + shardID + "|_local"); + } + + // dump query to log + LOG.debug("{} OpenSearch query {} - {}", logIdprefix, queueName, request.toString()); + + client.searchAsync(request, RequestOptions.DEFAULT, hrl); + } + + /** Overrides the handling of responses for aggregations. */ + @Override + public void onResponse(SearchResponse response) { + // delete all entries from the searchAfterCache when + // we get the results from the aggregation spouts + searchAfterCache.invalidateAll(); + super.onResponse(response); + } + + /** The aggregation kindly told us where to start from. */ + @Override + protected void sortValuesForKey(String key, Object[] sortValues) { + if (sortValues != null && sortValues.length > 0) { + this.searchAfterCache.put(key, sortValues); + } + } + + /** Handling of results for a specific queue. 
*/ + class HostResultListener implements ActionListener { + + @Override + public void onResponse(SearchResponse response) { + + int alreadyprocessed = 0; + int numDocs = 0; + + SearchHit[] hits = response.getHits().getHits(); + + Object[] sortValues = null; + + // retrieve the key for these results + String key = null; + + for (SearchHit hit : hits) { + numDocs++; + String pfield = partitionField; + Map sourceAsMap = hit.getSourceAsMap(); + if (pfield.startsWith("metadata.")) { + sourceAsMap = (Map) sourceAsMap.get("metadata"); + pfield = pfield.substring(9); + } + Object key_as_object = sourceAsMap.get(pfield); + if (key_as_object instanceof List) { + if (((List) (key_as_object)).size() == 1) { + key = ((List) key_as_object).get(0); + } + } else { + key = key_as_object.toString(); + } + + sortValues = hit.getSortValues(); + if (!addHitToBuffer(hit)) { + alreadyprocessed++; + } + } + + // no key if no results have been found + if (key != null) { + searchAfterCache.put(key, sortValues); + } + + eventCounter.scope("OpenSearch_queries_host").incrBy(1); + eventCounter.scope("OpenSearch_docs_host").incrBy(numDocs); + eventCounter.scope("already_being_processed_host").incrBy(alreadyprocessed); + + LOG.info( + "{} OpenSearch term query returned {} hits in {} msec with {} already being processed for {}", + logIdprefix, + numDocs, + response.getTook().getMillis(), + alreadyprocessed, + key); + } + + @Override + public void onFailure(Exception e) { + LOG.error("Exception with OpenSearch query", e); + } + } +} diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java new file mode 100644 index 000000000..bd178f7db --- /dev/null +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java @@ -0,0 +1,490 @@ +/* + * Licensed to the Apache Software 
Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.stormcrawler.opensearch.persistence; + +import com.github.benmanes.caffeine.cache.Cache; +import com.github.benmanes.caffeine.cache.Caffeine; +import com.github.benmanes.caffeine.cache.RemovalCause; +import com.github.benmanes.caffeine.cache.RemovalListener; +import java.io.IOException; +import java.util.Arrays; +import java.util.Date; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.locks.ReentrantLock; +import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; +import org.apache.storm.metric.api.MultiCountMetric; +import org.apache.storm.metric.api.MultiReducedMetric; +import org.apache.storm.task.OutputCollector; +import org.apache.storm.task.TopologyContext; +import org.apache.storm.tuple.Tuple; +import org.apache.stormcrawler.Metadata; +import org.apache.stormcrawler.opensearch.BulkItemResponseToFailedFlag; +import org.apache.stormcrawler.opensearch.Constants; +import org.apache.stormcrawler.opensearch.IndexCreation; +import 
org.apache.stormcrawler.opensearch.OpenSearchConnection; +import org.apache.stormcrawler.persistence.AbstractStatusUpdaterBolt; +import org.apache.stormcrawler.persistence.Status; +import org.apache.stormcrawler.util.ConfUtils; +import org.apache.stormcrawler.util.PerSecondReducer; +import org.apache.stormcrawler.util.URLPartitioner; +import org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; +import org.opensearch.action.DocWriteRequest; +import org.opensearch.action.bulk.BulkItemResponse; +import org.opensearch.action.bulk.BulkProcessor; +import org.opensearch.action.bulk.BulkRequest; +import org.opensearch.action.bulk.BulkResponse; +import org.opensearch.action.index.IndexRequest; +import org.opensearch.common.xcontent.XContentFactory; +import org.opensearch.core.rest.RestStatus; +import org.opensearch.core.xcontent.XContentBuilder; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Simple bolt which stores the status of URLs into OpenSearch. Takes the tuples coming from the + * 'status' stream. To be used in combination with a Spout to read from the index. 
+ */ +public class StatusUpdaterBolt extends AbstractStatusUpdaterBolt + implements RemovalListener>, BulkProcessor.Listener { + + private static final Logger LOG = LoggerFactory.getLogger(StatusUpdaterBolt.class); + + private String OSBoltType = "status"; + + private static final String OSStatusIndexNameParamName = + Constants.PARAMPREFIX + "%s.index.name"; + private static final String OSStatusRoutingParamName = Constants.PARAMPREFIX + "%s.routing"; + private static final String OSStatusRoutingFieldParamName = + Constants.PARAMPREFIX + "%s.routing.fieldname"; + + private boolean routingFieldNameInMetadata = false; + + private String indexName; + + private URLPartitioner partitioner; + + /** whether to apply the same partitioning logic used for politeness for routing, e.g byHost */ + private boolean doRouting; + + /** Store the key used for routing explicitly as a field in metadata * */ + private String fieldNameForRoutingKey = null; + + private OpenSearchConnection connection; + + private Cache> waitAck; + + // Be fair due to cache timeout + private final ReentrantLock waitAckLock = new ReentrantLock(true); + + private MultiCountMetric eventCounter; + + private MultiReducedMetric receivedPerSecMetrics; + + public StatusUpdaterBolt() { + super(); + } + + /** + * Loads the configuration using a substring different from the default value 'status' in order + * to distinguish it from the spout configurations + */ + public StatusUpdaterBolt(String boltType) { + super(); + OSBoltType = boltType; + } + + @Override + public void prepare( + Map stormConf, TopologyContext context, OutputCollector collector) { + + super.prepare(stormConf, context, collector); + + indexName = + ConfUtils.getString( + stormConf, + String.format( + Locale.ROOT, + StatusUpdaterBolt.OSStatusIndexNameParamName, + OSBoltType), + "status"); + + doRouting = + ConfUtils.getBoolean( + stormConf, + String.format( + Locale.ROOT, + StatusUpdaterBolt.OSStatusRoutingParamName, + OSBoltType), + false); + + 
partitioner = new URLPartitioner(); + partitioner.configure(stormConf); + + fieldNameForRoutingKey = + ConfUtils.getString( + stormConf, + String.format( + Locale.ROOT, + StatusUpdaterBolt.OSStatusRoutingFieldParamName, + OSBoltType)); + if (StringUtils.isNotBlank(fieldNameForRoutingKey)) { + if (fieldNameForRoutingKey.startsWith("metadata.")) { + routingFieldNameInMetadata = true; + fieldNameForRoutingKey = fieldNameForRoutingKey.substring("metadata.".length()); + } + // periods are not allowed in - replace with %2E + fieldNameForRoutingKey = fieldNameForRoutingKey.replaceAll("\\.", "%2E"); + } + + String defaultSpec = + String.format( + Locale.ROOT, + "expireAfterWrite=%ds", + ConfUtils.getInt(stormConf, "topology.message.timeout.secs", 300)); + + String waitAckSpec = + ConfUtils.getString(stormConf, "opensearch.status.waitack.cache.spec", defaultSpec); + + waitAck = Caffeine.from(waitAckSpec).removalListener(this).build(); + + int metrics_time_bucket_secs = 30; + + // create gauge for waitAck + context.registerMetric("waitAck", () -> waitAck.estimatedSize(), metrics_time_bucket_secs); + + // benchmarking - average number of items received back by OpenSearch per second + this.receivedPerSecMetrics = + context.registerMetric( + "average_persec", + new MultiReducedMetric(new PerSecondReducer()), + metrics_time_bucket_secs); + + this.eventCounter = + context.registerMetric( + "counters", new MultiCountMetric(), metrics_time_bucket_secs); + + try { + connection = OpenSearchConnection.getConnection(stormConf, OSBoltType, this); + } catch (Exception e1) { + LOG.error("Can't connect to OpenSearch", e1); + throw new RuntimeException(e1); + } + + // use the default status schema if none has been specified + try { + IndexCreation.checkOrCreateIndex(connection.getClient(), indexName, OSBoltType, LOG); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Override + public void cleanup() { + if (connection == null) { + return; + } + connection.close(); +
connection = null; + } + + @Override + public void store( + String url, Status status, Metadata metadata, Optional nextFetch, Tuple tuple) + throws Exception { + + String documentID = getDocumentID(metadata, url); + + boolean isAlreadySentAndDiscovered; + // need to synchronize: otherwise it might get added to the cache + // without having been sent to OpenSearch + waitAckLock.lock(); + try { + // check that the same URL is not being sent to OpenSearch + final var alreadySent = waitAck.getIfPresent(documentID); + isAlreadySentAndDiscovered = status.equals(Status.DISCOVERED) && alreadySent != null; + } finally { + waitAckLock.unlock(); + } + + if (isAlreadySentAndDiscovered) { + // if this object is discovered - adding another version of it + // won't make any difference + LOG.debug( + "Already being sent to OpenSearch {} with status {} and ID {}", + url, + status, + documentID); + // ack straight away! + eventCounter.scope("skipped").incrBy(1); + super.ack(tuple, url); + return; + } + + XContentBuilder builder = XContentFactory.jsonBuilder().startObject(); + builder.field("url", url); + builder.field("status", status); + + builder.startObject("metadata"); + for (String mdKey : metadata.keySet()) { + String[] values = metadata.getValues(mdKey); + // periods are not allowed - replace with %2E + mdKey = mdKey.replaceAll("\\.", "%2E"); + builder.array(mdKey, values); + } + + String partitionKey = partitioner.getPartition(url, metadata); + if (partitionKey == null) { + partitionKey = "_DEFAULT_"; + } + + // store routing key in metadata? + if (StringUtils.isNotBlank(fieldNameForRoutingKey) && routingFieldNameInMetadata) { + builder.field(fieldNameForRoutingKey, partitionKey); + } + + builder.endObject(); + + // store routing key outside metadata? 
+ if (StringUtils.isNotBlank(fieldNameForRoutingKey) && !routingFieldNameInMetadata) { + builder.field(fieldNameForRoutingKey, partitionKey); + } + + if (nextFetch.isPresent()) { + builder.timeField("nextFetchDate", nextFetch.get()); + } + + builder.endObject(); + + IndexRequest request = new IndexRequest(getIndexName(metadata)); + + // check that we don't overwrite an existing entry + // When create is used, the index operation will fail if a document + // by that id already exists in the index. + final boolean create = status.equals(Status.DISCOVERED); + request.source(builder).id(documentID).create(create); + + if (doRouting) { + request.routing(partitionKey); + } + + waitAckLock.lock(); + try { + final List tt = waitAck.get(documentID, k -> new LinkedList<>()); + tt.add(tuple); + LOG.debug("Added to waitAck {} with ID {} total {}", url, documentID, tt.size()); + } finally { + waitAckLock.unlock(); + } + + LOG.debug("Sending to OpenSearch buffer {} with ID {}", url, documentID); + + connection.addToProcessor(request); + } + + @Override + public void onRemoval( + @Nullable String key, @Nullable List value, @NotNull RemovalCause cause) { + if (!cause.wasEvicted()) { + return; + } + LOG.error("Purged from waitAck {} with {} values", key, value.size()); + for (Tuple t : value) { + eventCounter.scope("purged").incrBy(1); + collector.fail(t); + } + } + + @Override + public void afterBulk(long executionId, BulkRequest request, BulkResponse response) { + LOG.debug("afterBulk [{}] with {} responses", executionId, request.numberOfActions()); + eventCounter.scope("bulks_received").incrBy(1); + eventCounter.scope("bulk_msec").incrBy(response.getTook().getMillis()); + eventCounter.scope("received").incrBy(request.numberOfActions()); + receivedPerSecMetrics.scope("received").update(request.numberOfActions()); + + var idsToBulkItemsWithFailedFlag = + Arrays.stream(response.getItems()) + .map( + bir -> { + String id = bir.getId(); + BulkItemResponse.Failure f = 
bir.getFailure(); + boolean failed = false; + if (f != null) { + // already discovered + if (f.getStatus().equals(RestStatus.CONFLICT)) { + eventCounter.scope("doc_conflicts").incrBy(1); + LOG.debug("Doc conflict ID {}", id); + } else { + LOG.error("Update ID {}, failure: {}", id, f); + failed = true; + } + } + return new BulkItemResponseToFailedFlag(bir, failed); + }) + .collect( + // https://github.com/apache/stormcrawler/issues/832 + Collectors.groupingBy( + idWithFailedFlagTuple -> idWithFailedFlagTuple.id, + Collectors.toUnmodifiableList())); + + Map> presentTuples; + long estimatedSize; + Set debugInfo = null; + waitAckLock.lock(); + try { + presentTuples = waitAck.getAllPresent(idsToBulkItemsWithFailedFlag.keySet()); + if (!presentTuples.isEmpty()) { + waitAck.invalidateAll(presentTuples.keySet()); + } + estimatedSize = waitAck.estimatedSize(); + // Only if we have to. + if (LOG.isDebugEnabled() && estimatedSize > 0L) { + debugInfo = new HashSet<>(waitAck.asMap().keySet()); + } + } finally { + waitAckLock.unlock(); + } + + int ackCount = 0; + int failureCount = 0; + + for (var entry : presentTuples.entrySet()) { + final var id = entry.getKey(); + final var associatedTuple = entry.getValue(); + final var bulkItemsWithFailedFlag = idsToBulkItemsWithFailedFlag.get(id); + + BulkItemResponseToFailedFlag selected; + if (bulkItemsWithFailedFlag.size() == 1) { + selected = bulkItemsWithFailedFlag.get(0); + } else { + // Fallback if there are multiple responses for the same id + BulkItemResponseToFailedFlag tmp = null; + var ctFailed = 0; + for (var buwff : bulkItemsWithFailedFlag) { + if (tmp == null) { + tmp = buwff; + } + if (buwff.failed) { + ctFailed++; + } else { + tmp = buwff; + } + } + if (ctFailed != bulkItemsWithFailedFlag.size()) { + LOG.warn( + "The id {} would result in an ack and a failure. 
Using only the ack for processing.", + id); + } + selected = Objects.requireNonNull(tmp); + } + + if (associatedTuple != null) { + LOG.debug("Acked {} tuple(s) for ID {}", associatedTuple.size(), id); + for (Tuple tuple : associatedTuple) { + if (!selected.failed) { + String url = tuple.getStringByField("url"); + ackCount++; + // ack and put in cache + LOG.debug("Acked {} with ID {}", url, id); + eventCounter.scope("acked").incrBy(1); + super.ack(tuple, url); + } else { + failureCount++; + eventCounter.scope("failed").incrBy(1); + collector.fail(tuple); + } + } + } else { + LOG.warn("Could not find unacked tuple for {}", id); + } + } + + LOG.info( + "Bulk response [{}] : items {}, waitAck {}, acked {}, failed {}", + executionId, + idsToBulkItemsWithFailedFlag.size(), + estimatedSize, + ackCount, + failureCount); + if (debugInfo != null) { + for (String kinaw : debugInfo) { + LOG.debug("Still in wait ack after bulk response [{}] => {}", executionId, kinaw); + } + } + } + + @Override + public void afterBulk(long executionId, BulkRequest request, Throwable throwable) { + eventCounter.scope("bulks_received").incrBy(1); + eventCounter.scope("received").incrBy(request.numberOfActions()); + receivedPerSecMetrics.scope("received").update(request.numberOfActions()); + LOG.error("Exception with bulk {} - failing the whole lot ", executionId, throwable); + + final var failedIds = + request.requests().stream() + .map(DocWriteRequest::id) + .collect(Collectors.toUnmodifiableSet()); + Map> failedTupleLists; + waitAckLock.lock(); + try { + failedTupleLists = waitAck.getAllPresent(failedIds); + if (!failedTupleLists.isEmpty()) { + waitAck.invalidateAll(failedTupleLists.keySet()); + } + } finally { + waitAckLock.unlock(); + } + + for (var id : failedIds) { + var failedTuples = failedTupleLists.get(id); + if (failedTuples != null) { + LOG.debug("Failed {} tuple(s) for ID {}", failedTuples.size(), id); + for (Tuple x : failedTuples) { + // fail it + 
eventCounter.scope("failed").incrBy(1); + collector.fail(x); + } + } else { + LOG.warn("Could not find unacked tuple for {}", id); + } + } + } + + @Override + public void beforeBulk(long executionId, BulkRequest request) { + LOG.debug("beforeBulk {} with {} actions", executionId, request.numberOfActions()); + eventCounter.scope("bulks_sent").incrBy(1); + } + + /** + * Must be overridden for implementing custom index names based on some metadata information By + * Default, indexName coming from config is used + */ + protected String getIndexName(Metadata m) { + return indexName; + } +} diff --git a/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/bolt/AbstractOpenSearchTest.java b/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/bolt/AbstractOpenSearchTest.java new file mode 100644 index 000000000..e9c72b336 --- /dev/null +++ b/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/bolt/AbstractOpenSearchTest.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.stormcrawler.opensearch.bolt; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.junit.jupiter.Testcontainers; +import org.testcontainers.utility.DockerImageName; + +@Testcontainers(disabledWithoutDocker = true) +public abstract class AbstractOpenSearchTest { + + private static final String OPENSEARCH_VERSION = "2.19.4"; + + public static final String PASSWORD = "This1sAPassw0rd"; + + protected GenericContainer opensearchContainer = + new GenericContainer<>( + DockerImageName.parse( + "opensearchproject/opensearch:" + OPENSEARCH_VERSION)) + .withExposedPorts(9200) + .withEnv("plugins.security.disabled", "true") + .withEnv("discovery.type", "single-node") + .withEnv("OPENSEARCH_JAVA_OPTS", "-Xms512m -Xmx512m") + .withEnv("OPENSEARCH_INITIAL_ADMIN_PASSWORD", PASSWORD); + + @BeforeEach + void init() { + opensearchContainer.start(); + } + + @AfterEach + void close() { + opensearchContainer.close(); + } +} diff --git a/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/bolt/IndexerBoltTest.java b/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/bolt/IndexerBoltTest.java new file mode 100644 index 000000000..aa953a283 --- /dev/null +++ b/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/bolt/IndexerBoltTest.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.stormcrawler.opensearch.bolt; + +import static org.awaitility.Awaitility.await; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import org.apache.storm.task.OutputCollector; +import org.apache.storm.tuple.Tuple; +import org.apache.stormcrawler.Constants; +import org.apache.stormcrawler.Metadata; +import org.apache.stormcrawler.TestOutputCollector; +import org.apache.stormcrawler.TestUtil; +import org.apache.stormcrawler.indexing.AbstractIndexerBolt; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +class IndexerBoltTest extends AbstractOpenSearchTest { + + private IndexerBolt bolt; + + protected TestOutputCollector output; + + private static final Logger LOG = LoggerFactory.getLogger(IndexerBoltTest.class); + + private static ExecutorService executorService; + + @BeforeAll + static void beforeClass() { + executorService = Executors.newFixedThreadPool(2); + } + + @AfterAll + static void afterClass() { + 
executorService.shutdown(); + executorService = null; + } + + @BeforeEach + void setupIndexerBolt() { + bolt = new IndexerBolt("content"); + // give the indexer the port for connecting to OpenSearch + final String host = opensearchContainer.getHost(); + final Integer port = opensearchContainer.getFirstMappedPort(); + final Map conf = new HashMap<>(); + conf.put(AbstractIndexerBolt.urlFieldParamName, "url"); + conf.put(AbstractIndexerBolt.canonicalMetadataParamName, "canonical"); + conf.put("opensearch.indexer.addresses", host + ":" + port); + output = new TestOutputCollector(); + bolt.prepare(conf, TestUtil.getMockedTopologyContext(), new OutputCollector(output)); + } + + @AfterEach + void close() { + LOG.info("Closing indexer bolt and Opensearch container"); + super.close(); + bolt.cleanup(); + output = null; + } + + private void index(String url, String text, Metadata metadata) { + Tuple tuple = mock(Tuple.class); + when(tuple.getStringByField("text")).thenReturn(text); + when(tuple.getStringByField("url")).thenReturn(url); + when(tuple.getValueByField("metadata")).thenReturn(metadata); + bolt.execute(tuple); + } + + private int lastIndex(String url, String text, Metadata metadata, long timeoutInMs) + throws ExecutionException, InterruptedException, TimeoutException { + var oldSize = output.getEmitted(Constants.StatusStreamName).size(); + index(url, text, metadata); + return executorService + .submit( + () -> { + await().atMost(timeoutInMs, TimeUnit.MILLISECONDS) + .until( + () -> + output.getEmitted(Constants.StatusStreamName) + .size() + > oldSize); + return output.getEmitted(Constants.StatusStreamName).size(); + }) + .get(timeoutInMs, TimeUnit.MILLISECONDS); + } + + @Test + @Timeout(value = 2, unit = TimeUnit.MINUTES) + // https://github.com/apache/stormcrawler/issues/832 + void simultaneousCanonicals() + throws ExecutionException, InterruptedException, TimeoutException { + Metadata m1 = new Metadata(); + String url = + 
"https://www.obozrevatel.com/ukr/dnipro/city/u-dnipri-ta-oblasti-ogolosili-shtormove-poperedzhennya.htm"; + m1.addValue("canonical", url); + Metadata m2 = new Metadata(); + String url2 = + "https://www.obozrevatel.com/ukr/dnipro/city/u-dnipri-ta-oblasti-ogolosili-shtormove-poperedzhennya/amp.htm"; + m2.addValue("canonical", url); + index(url, "", m1); + lastIndex(url2, "", m2, 10_000); + // should be two in status output + assertEquals(2, output.getEmitted(Constants.StatusStreamName).size()); + // and 2 acked + assertEquals(2, output.getAckedTuples().size()); + // TODO check output in Opensearch? + } +} diff --git a/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/bolt/StatusBoltTest.java b/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/bolt/StatusBoltTest.java new file mode 100644 index 000000000..f8440835d --- /dev/null +++ b/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/bolt/StatusBoltTest.java @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.stormcrawler.opensearch.bolt; + +import static org.awaitility.Awaitility.await; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import org.apache.http.HttpHost; +import org.apache.storm.task.OutputCollector; +import org.apache.storm.tuple.Tuple; +import org.apache.stormcrawler.Metadata; +import org.apache.stormcrawler.TestOutputCollector; +import org.apache.stormcrawler.TestUtil; +import org.apache.stormcrawler.opensearch.persistence.StatusUpdaterBolt; +import org.apache.stormcrawler.persistence.Status; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; +import org.opensearch.action.get.GetRequest; +import org.opensearch.action.get.GetResponse; +import org.opensearch.client.RequestOptions; +import org.opensearch.client.RestClient; +import org.opensearch.client.RestClientBuilder; +import org.opensearch.client.RestHighLevelClient; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +class StatusBoltTest extends AbstractOpenSearchTest { + + private StatusUpdaterBolt bolt; + + protected TestOutputCollector output; + + protected org.opensearch.client.RestHighLevelClient client; + + private static final Logger LOG = LoggerFactory.getLogger(StatusBoltTest.class); + + private static ExecutorService executorService; + + @BeforeAll + static void beforeClass() { + 
executorService = Executors.newFixedThreadPool(2); + } + + @AfterAll + static void afterClass() { + executorService.shutdown(); + executorService = null; + } + + @BeforeEach + void setupStatusBolt() throws IOException { + bolt = new StatusUpdaterBolt(); + RestClientBuilder builder = + RestClient.builder( + new HttpHost( + opensearchContainer.getHost(), + opensearchContainer.getMappedPort(9200))); + client = new RestHighLevelClient(builder); + // configure the status updater bolt + Map conf = new HashMap<>(); + conf.put("opensearch.status.routing.fieldname", "metadata.key"); + conf.put( + "opensearch.status.addresses", + opensearchContainer.getHost() + ":" + opensearchContainer.getFirstMappedPort()); + conf.put("scheduler.class", "org.apache.stormcrawler.persistence.DefaultScheduler"); + conf.put("status.updater.cache.spec", "maximumSize=10000,expireAfterAccess=1h"); + conf.put("metadata.persist", "someKey"); + output = new TestOutputCollector(); + bolt.prepare(conf, TestUtil.getMockedTopologyContext(), new OutputCollector(output)); + } + + @AfterEach + void close() { + LOG.info("Closing updater bolt and Opensearch container"); + super.close(); + bolt.cleanup(); + output = null; + try { + client.close(); + } catch (IOException e) { + } + } + + private Future store(String url, Status status, Metadata metadata) { + Tuple tuple = mock(Tuple.class); + when(tuple.getValueByField("status")).thenReturn(status); + when(tuple.getStringByField("url")).thenReturn(url); + when(tuple.getValueByField("metadata")).thenReturn(metadata); + bolt.execute(tuple); + return executorService.submit( + () -> { + await().atMost(30, TimeUnit.SECONDS) + .until(() -> output.getAckedTuples().size() > 0); + return output.getAckedTuples().size(); + }); + } + + @Test + @Timeout(value = 2, unit = TimeUnit.MINUTES) + // see https://github.com/apache/stormcrawler/issues/885 + void checkListKeyFromOpensearch() + throws IOException, ExecutionException, InterruptedException, TimeoutException { + String 
url = "https://www.url.net/something"; + Metadata md = new Metadata(); + md.addValue("someKey", "someValue"); + store(url, Status.DISCOVERED, md).get(10, TimeUnit.SECONDS); + assertEquals(1, output.getAckedTuples().size()); + // check output in Opensearch? + String id = org.apache.commons.codec.digest.DigestUtils.sha256Hex(url); + GetResponse result = client.get(new GetRequest("status", id), RequestOptions.DEFAULT); + Map sourceAsMap = result.getSourceAsMap(); + final String pfield = "metadata.somekey"; + sourceAsMap = (Map) sourceAsMap.get("metadata"); + final var pfieldNew = pfield.substring(9); + Object key = sourceAsMap.get(pfieldNew); + assertTrue(key instanceof java.util.ArrayList); + } +} diff --git a/external/opensearch-java/src/test/resources/indexer.mapping b/external/opensearch-java/src/test/resources/indexer.mapping new file mode 100644 index 000000000..fc6eb887f --- /dev/null +++ b/external/opensearch-java/src/test/resources/indexer.mapping @@ -0,0 +1,40 @@ +{ + "settings": { + "index": { + "number_of_shards": 5, + "number_of_replicas": 1, + "refresh_interval": "60s" + } + }, + "mappings": { + "_source": { + "enabled": true + }, + "properties": { + "content": { + "type": "text" + }, + "description": { + "type": "text" + }, + "domain": { + "type": "keyword" + }, + "format": { + "type": "keyword" + }, + "keywords": { + "type": "keyword" + }, + "host": { + "type": "keyword" + }, + "title": { + "type": "text" + }, + "url": { + "type": "keyword" + } + } + } +} diff --git a/external/opensearch-java/src/test/resources/metrics.mapping b/external/opensearch-java/src/test/resources/metrics.mapping new file mode 100644 index 000000000..fc6ae3a09 --- /dev/null +++ b/external/opensearch-java/src/test/resources/metrics.mapping @@ -0,0 +1,40 @@ +{ + "index_patterns": "metrics*", + "settings": { + "index": { + "number_of_shards": 1, + "refresh_interval": "30s" + }, + "number_of_replicas": 0 + }, + "mappings": { + "_source": { "enabled": true }, + "properties": { + 
"name": { + "type": "keyword" + }, + "stormId": { + "type": "keyword" + }, + "srcComponentId": { + "type": "keyword" + }, + "srcTaskId": { + "type": "short" + }, + "srcWorkerHost": { + "type": "keyword" + }, + "srcWorkerPort": { + "type": "integer" + }, + "timestamp": { + "type": "date", + "format": "date_optional_time" + }, + "value": { + "type": "double" + } + } + } +} diff --git a/external/opensearch-java/src/test/resources/status.mapping b/external/opensearch-java/src/test/resources/status.mapping new file mode 100644 index 000000000..e5b14fe97 --- /dev/null +++ b/external/opensearch-java/src/test/resources/status.mapping @@ -0,0 +1,39 @@ +{ + "settings": { + "index": { + "number_of_shards": 10, + "number_of_replicas": 1, + "refresh_interval": "5s" + } + }, + "mappings": { + "dynamic_templates": [{ + "metadata": { + "path_match": "metadata.*", + "match_mapping_type": "string", + "mapping": { + "type": "keyword" + } + } + }], + "_source": { + "enabled": true + }, + "properties": { + "key": { + "type": "keyword", + "index": true + }, + "nextFetchDate": { + "type": "date", + "format": "date_optional_time" + }, + "status": { + "type": "keyword" + }, + "url": { + "type": "keyword" + } + } + } +} diff --git a/pom.xml b/pom.xml index 4bcea4ad9..ed9c61de5 100644 --- a/pom.xml +++ b/pom.xml @@ -719,6 +719,7 @@ under the License. external/aws external/langid external/opensearch + external/opensearch-java external/playwright external/selenium external/solr @@ -728,6 +729,7 @@ under the License. external/warc archetype external/opensearch/archetype + external/opensearch-java/archetype external/solr/archetype docs From 28bf702fb954e55b7b5f2f78d6e1fb0a5aa072b8 Mon Sep 17 00:00:00 2001 From: Davide Polato Date: Fri, 3 Apr 2026 11:36:35 +0200 Subject: [PATCH 2/4] feat: introduce stormcrawler-opensearch-java module (#1515) Introduces the external/opensearch-java module, replacing the deprecated RestHighLevelClient with the official opensearch-java client. 
Designed as a drop-in replacement for `external/opensearch` with identical configurations. Key improvements: - Implemented AsyncBulkProcessor (Semaphore + dedicated ThreadPool) to ensure strict backpressure and replace the legacy BulkProcessor. - Fixed historical tuple-ack race conditions in IndexerBolt and DeletionBolt. - Maintained RestClientTransport to seamlessly support the Sniffer and bypass the 100MB response buffer limit. - Synced recent upstream bugfixes, adapting resource cleanup to the new async architecture. --- THIRD-PARTY.txt | 18 + external/opensearch-java/pom.xml | 25 +- .../opensearch/AsyncBulkProcessor.java | 300 +++++++++++++++ .../BulkItemResponseToFailedFlag.java | 91 ++--- .../opensearch/IndexCreation.java | 72 ++-- .../opensearch/OpenSearchConnection.java | 356 ++++++++++++------ .../opensearch/bolt/DeletionBolt.java | 38 +- .../opensearch/bolt/IndexerBolt.java | 121 +++--- .../filtering/JSONURLFilterWrapper.java | 95 +++-- .../opensearch/metrics/MetricsConsumer.java | 33 +- .../opensearch/metrics/StatusMetricsBolt.java | 137 ++++--- .../parse/filter/JSONResourceWrapper.java | 93 +++-- .../opensearch/persistence/AbstractSpout.java | 32 +- .../persistence/AggregationSpout.java | 272 +++++++------ .../opensearch/persistence/HybridSpout.java | 164 ++++---- .../persistence/StatusUpdaterBolt.java | 112 +++--- .../opensearch/AsyncBulkProcessorTest.java | 241 ++++++++++++ .../opensearch/OpenSearchConnectionTest.java | 75 ++++ .../bolt/AbstractOpenSearchTest.java | 3 +- .../opensearch/bolt/StatusBoltTest.java | 35 +- pom.xml | 1 + 21 files changed, 1597 insertions(+), 717 deletions(-) create mode 100644 external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/AsyncBulkProcessor.java create mode 100644 external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/AsyncBulkProcessorTest.java create mode 100644 external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/OpenSearchConnectionTest.java diff --git 
a/THIRD-PARTY.txt b/THIRD-PARTY.txt index 63f63bacb..9517b877e 100644 --- a/THIRD-PARTY.txt +++ b/THIRD-PARTY.txt @@ -30,6 +30,7 @@ List of third-party dependencies grouped by their license type. * Apache Commons IO (commons-io:commons-io:2.21.0 - https://commons.apache.org/proper/commons-io/) * Apache Commons Lang (org.apache.commons:commons-lang3:3.20.0 - https://commons.apache.org/proper/commons-lang/) * Apache Commons Logging (commons-logging:commons-logging:1.2 - http://commons.apache.org/proper/commons-logging/) + * Apache Commons Logging (commons-logging:commons-logging:1.3.3 - https://commons.apache.org/proper/commons-logging/) * Apache Commons Logging (commons-logging:commons-logging:1.3.6 - https://commons.apache.org/proper/commons-logging/) * Apache Commons Math (org.apache.commons:commons-math3:3.6.1 - http://commons.apache.org/proper/commons-math/) * Apache FontBox (org.apache.pdfbox:fontbox:3.0.7 - http://pdfbox.apache.org/) @@ -51,7 +52,10 @@ List of third-party dependencies grouped by their license type. 
* Apache HBase Unsafe Wrapper (org.apache.hbase.thirdparty:hbase-unsafe:4.1.12 - https://hbase.apache.org/hbase-unsafe) * Apache HttpAsyncClient (org.apache.httpcomponents:httpasyncclient:4.1.5 - http://hc.apache.org/httpcomponents-asyncclient) * Apache HttpClient (org.apache.httpcomponents:httpclient:4.5.14 - http://hc.apache.org/httpcomponents-client-ga) + * Apache HttpClient (org.apache.httpcomponents.client5:httpclient5:5.3.1 - https://hc.apache.org/httpcomponents-client-5.0.x/5.3.1/httpclient5/) * Apache HttpClient Mime (org.apache.httpcomponents:httpmime:4.5.14 - http://hc.apache.org/httpcomponents-client-ga) + * Apache HttpComponents Core HTTP/1.1 (org.apache.httpcomponents.core5:httpcore5:5.2.5 - https://hc.apache.org/httpcomponents-core-5.2.x/5.2.5/httpcore5/) + * Apache HttpComponents Core HTTP/2 (org.apache.httpcomponents.core5:httpcore5-h2:5.2.5 - https://hc.apache.org/httpcomponents-core-5.2.x/5.2.5/httpcore5-h2/) * Apache HttpCore (org.apache.httpcomponents:httpcore:4.4.16 - http://hc.apache.org/httpcomponents-core-ga) * Apache HttpCore NIO (org.apache.httpcomponents:httpcore-nio:4.4.16 - http://hc.apache.org/httpcomponents-core-ga) * Apache James :: Mime4j :: Core (org.apache.james:apache-mime4j-core:0.8.13 - http://james.apache.org/mime4j/apache-mime4j-core) @@ -212,6 +216,7 @@ List of third-party dependencies grouped by their license type. 
* opensearch-compress (org.opensearch:opensearch-compress:2.19.5 - https://github.com/opensearch-project/OpenSearch.git) * opensearch-core (org.opensearch:opensearch-core:2.19.5 - https://github.com/opensearch-project/OpenSearch.git) * opensearch-geo (org.opensearch:opensearch-geo:2.19.5 - https://github.com/opensearch-project/OpenSearch.git) + * OpenSearch Java Client (org.opensearch.client:opensearch-java:2.13.0 - https://github.com/opensearch-project/opensearch-java/) * opensearch-secure-sm (org.opensearch:opensearch-secure-sm:2.19.5 - https://github.com/opensearch-project/OpenSearch.git) * opensearch-task-commons (org.opensearch:opensearch-task-commons:2.19.5 - https://github.com/opensearch-project/OpenSearch.git) * opensearch-telemetry (org.opensearch:opensearch-telemetry:2.19.5 - https://github.com/opensearch-project/OpenSearch.git) @@ -248,6 +253,7 @@ List of third-party dependencies grouped by their license type. * Playwright - Main Library (com.microsoft.playwright:playwright:1.58.0 - https://github.com/microsoft/playwright-java/playwright) * proto-google-common-protos (com.google.api.grpc:proto-google-common-protos:2.59.2 - https://github.com/googleapis/sdk-platform-java) * rank-eval (org.opensearch.plugin:rank-eval-client:2.19.5 - https://github.com/opensearch-project/OpenSearch.git) + * rest (org.opensearch.client:opensearch-rest-client:2.12.0 - https://github.com/opensearch-project/OpenSearch.git) * rest (org.opensearch.client:opensearch-rest-client:2.19.5 - https://github.com/opensearch-project/OpenSearch.git) * rest-high-level (org.opensearch.client:opensearch-rest-high-level-client:2.19.5 - https://github.com/opensearch-project/OpenSearch.git) * rome (com.rometools:rome:2.1.0 - http://rometools.com/rome) @@ -256,6 +262,7 @@ List of third-party dependencies grouped by their license type. 
* Shaded Deps for Storm Client (org.apache.storm:storm-shaded-deps:2.8.5 - https://storm.apache.org/storm-shaded-deps) * SnakeYAML (org.yaml:snakeyaml:2.6 - https://bitbucket.org/snakeyaml/snakeyaml) * snappy-java (org.xerial.snappy:snappy-java:1.1.10.4 - https://github.com/xerial/snappy-java) + * sniffer (org.opensearch.client:opensearch-rest-client-sniffer:2.12.0 - https://github.com/opensearch-project/OpenSearch.git) * sniffer (org.opensearch.client:opensearch-rest-client-sniffer:2.19.5 - https://github.com/opensearch-project/OpenSearch.git) * SparseBitSet (com.zaxxer:SparseBitSet:1.3 - https://github.com/brettwooldridge/SparseBitSet) * storm-autocreds (org.apache.storm:storm-autocreds:2.8.5 - https://storm.apache.org/external/storm-autocreds) @@ -344,6 +351,10 @@ List of third-party dependencies grouped by their license type. * JAXB Runtime (org.glassfish.jaxb:jaxb-runtime:4.0.7 - https://eclipse-ee4j.github.io/jaxb-ri/) * TXW2 Runtime (org.glassfish.jaxb:txw2:4.0.7 - https://eclipse-ee4j.github.io/jaxb-ri/) + Eclipse Distribution License v. 1.0, Eclipse Public License v. 2.0 + + * org.eclipse.yasson (org.eclipse:yasson:2.0.2 - https://projects.eclipse.org/projects/ee4j.yasson) + Eclipse Public License, Version 2.0, GPL-2.0-with-classpath-exception * Jakarta RESTful WS API (jakarta.ws.rs:jakarta.ws.rs-api:3.1.0 - https://github.com/eclipse-ee4j/jaxrs-api) @@ -352,6 +363,13 @@ List of third-party dependencies grouped by their license type. 
* Jakarta Annotations API (jakarta.annotation:jakarta.annotation-api:1.3.5 - https://projects.eclipse.org/projects/ee4j.ca) + Eclipse Public License 2.0, GNU General Public License, version 2 with the GNU Classpath Exception + + * Eclipse Parsson (org.eclipse.parsson:parsson:1.1.6 - https://github.com/eclipse-ee4j/parsson/parsson) + * Jakarta JSON Processing API (jakarta.json:jakarta.json-api:2.1.3 - https://github.com/eclipse-ee4j/jsonp) + * JSON-B API (jakarta.json.bind:jakarta.json.bind-api:2.0.0 - https://eclipse-ee4j.github.io/jsonb-api) + * JSON-P Default Provider (org.glassfish:jakarta.json:2.0.0 - https://github.com/eclipse-ee4j/jsonp) + GENERAL PUBLIC LICENSE, version 3 (GPL-3.0), GNU LESSER GENERAL PUBLIC LICENSE, version 3 (LGPL-3.0), Mozilla Public License Version 1.1 * juniversalchardet (com.github.albfernandez:juniversalchardet:2.5.0 - https://github.com/albfernandez/juniversalchardet) diff --git a/external/opensearch-java/pom.xml b/external/opensearch-java/pom.xml index 376a11486..c7dc1e25d 100644 --- a/external/opensearch-java/pom.xml +++ b/external/opensearch-java/pom.xml @@ -31,6 +31,8 @@ under the License. 2.19.5 + 2.13.0 + 2.12.0 true 0.27 0.27 @@ -45,7 +47,7 @@ under the License. stormcrawler-opensearch-java - https://github.com/apache/stormcrawler/tree/master/external/opensearch + https://github.com/apache/stormcrawler/tree/master/external/opensearch-java OpenSearch module for Apache StormCrawler using the new opensearch-java client @@ -73,22 +75,17 @@ under the License. org.opensearch.client - opensearch-rest-high-level-client - ${opensearch.version} + opensearch-java + ${opensearch.java.version} - - org.awaitility - awaitility - test - - - org.opensearch.client opensearch-rest-client-sniffer - ${opensearch.version} + ${opensearch.restclient.version} @@ -111,6 +108,12 @@ under the License. 
test + + org.awaitility + awaitility + test + + org.slf4j slf4j-simple diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/AsyncBulkProcessor.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/AsyncBulkProcessor.java new file mode 100644 index 000000000..3f8f22d9a --- /dev/null +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/AsyncBulkProcessor.java @@ -0,0 +1,300 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.stormcrawler.opensearch; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.ScheduledFuture; +import java.util.concurrent.Semaphore; +import java.util.concurrent.SynchronousQueue; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.locks.ReentrantLock; +import org.opensearch.client.opensearch.OpenSearchClient; +import org.opensearch.client.opensearch.core.BulkRequest; +import org.opensearch.client.opensearch.core.BulkResponse; +import org.opensearch.client.opensearch.core.bulk.BulkOperation; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Replacement for the legacy {@code org.opensearch.action.bulk.BulkProcessor} that works with the + * new opensearch-java client. Accumulates {@link BulkOperation} instances and flushes them to + * OpenSearch either when the configured number of actions is reached or when a periodic timer + * fires. + * + *

Concurrency is controlled via a {@link Semaphore}: each in-flight bulk request acquires a + * permit, which provides natural back-pressure towards the Storm topology when the cluster slows + * down. + */ +public final class AsyncBulkProcessor implements AutoCloseable { + + private static final Logger LOG = LoggerFactory.getLogger(AsyncBulkProcessor.class); + + /** Listener interface equivalent to the legacy {@code BulkProcessor.Listener}. */ + public interface Listener { + void beforeBulk(long executionId, BulkRequest request); + + void afterBulk(long executionId, BulkRequest request, BulkResponse response); + + void afterBulk(long executionId, BulkRequest request, Throwable failure); + } + + private final OpenSearchClient client; + private final Listener listener; + private final int bulkActions; + private final int concurrentRequests; + private final Semaphore concurrencyPermits; + private final AtomicLong executionIdGen = new AtomicLong(0); + + private final ReentrantLock lock = new ReentrantLock(); + private List buffer; + + private final ScheduledExecutorService scheduler; + private final ScheduledFuture flushTask; + + /** Dedicated executor for bulk HTTP calls -- avoids starvation of ForkJoinPool.commonPool(). 
*/ + private final ExecutorService bulkExecutor; + + private final AtomicBoolean closed = new AtomicBoolean(false); + + private AsyncBulkProcessor(Builder builder) { + this.client = builder.client; + this.listener = builder.listener; + this.bulkActions = builder.bulkActions; + this.concurrentRequests = builder.concurrentRequests; + this.concurrencyPermits = new Semaphore(this.concurrentRequests); + this.buffer = new ArrayList<>(bulkActions); + + this.bulkExecutor = + new ThreadPoolExecutor( + 1, + this.concurrentRequests, + 60L, + TimeUnit.SECONDS, + new SynchronousQueue<>(), + r -> { + Thread t = new Thread(r, "AsyncBulkProcessor-bulk"); + t.setDaemon(true); + return t; + }, + new ThreadPoolExecutor.CallerRunsPolicy()); + + this.scheduler = + Executors.newSingleThreadScheduledExecutor( + r -> { + Thread t = new Thread(r, "AsyncBulkProcessor-flush"); + t.setDaemon(true); + return t; + }); + this.flushTask = + scheduler.scheduleWithFixedDelay( + this::flushIfNeeded, + builder.flushIntervalMillis, + builder.flushIntervalMillis, + TimeUnit.MILLISECONDS); + } + + /** Adds a single bulk operation. Triggers a flush when {@code bulkActions} is reached. */ + public void add(BulkOperation operation) { + if (closed.get()) { + throw new IllegalStateException("BulkProcessor is closed"); + } + List toFlush = null; + lock.lock(); + try { + buffer.add(operation); + if (buffer.size() >= bulkActions) { + toFlush = swapBuffer(); + } + } finally { + lock.unlock(); + } + if (toFlush != null) { + executeBulk(toFlush); + } + } + + /** Timer-triggered flush: only flushes if the buffer is non-empty. */ + private void flushIfNeeded() { + List toFlush = null; + lock.lock(); + try { + if (!buffer.isEmpty()) { + toFlush = swapBuffer(); + } + } finally { + lock.unlock(); + } + if (toFlush != null) { + executeBulk(toFlush); + } + } + + /** + * Swaps the current buffer with a fresh one and returns the old buffer. Caller must hold {@link + * #lock}. 
+ */ + private List swapBuffer() { + List old = buffer; + buffer = new ArrayList<>(bulkActions); + return old; + } + + /** Builds the request, acquires a concurrency permit, and executes asynchronously. */ + private void executeBulk(List operations) { + final long executionId = executionIdGen.incrementAndGet(); + final BulkRequest request = new BulkRequest.Builder().operations(operations).build(); + + try { + concurrencyPermits.acquire(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + listener.afterBulk(executionId, request, e); + return; + } + + try { + listener.beforeBulk(executionId, request); + } catch (Exception e) { + LOG.warn("beforeBulk callback threw exception", e); + } + + CompletableFuture.supplyAsync( + () -> { + try { + return client.bulk(request); + } catch (Exception e) { + throw new BulkExecutionException(e); + } + }, + bulkExecutor) + .whenComplete( + (response, throwable) -> { + concurrencyPermits.release(); + try { + if (throwable != null) { + Throwable cause = + throwable instanceof BulkExecutionException + ? throwable.getCause() + : throwable; + listener.afterBulk(executionId, request, cause); + } else { + listener.afterBulk(executionId, request, response); + } + } catch (Exception e) { + LOG.warn("afterBulk callback threw exception", e); + } + }); + } + + /** + * Drains pending operations and waits for all in-flight bulk requests to complete, up to the + * given timeout. Equivalent to the legacy {@code BulkProcessor.awaitClose()}. 
+ * + * @return {@code true} if all operations completed within the timeout + */ + public boolean awaitClose(long timeout, TimeUnit unit) throws InterruptedException { + if (!closed.compareAndSet(false, true)) { + return true; + } + + flushTask.cancel(false); + scheduler.shutdown(); + + // Flush any remaining buffered operations + List remaining = null; + lock.lock(); + try { + if (!buffer.isEmpty()) { + remaining = swapBuffer(); + } + } finally { + lock.unlock(); + } + if (remaining != null) { + executeBulk(remaining); + } + + // Wait for all in-flight requests to finish by acquiring all permits + boolean acquired = concurrencyPermits.tryAcquire(concurrentRequests, timeout, unit); + if (acquired) { + concurrencyPermits.release(concurrentRequests); + } + + bulkExecutor.shutdown(); + bulkExecutor.awaitTermination(timeout, unit); + + return acquired; + } + + @Override + public void close() { + try { + awaitClose(60, TimeUnit.SECONDS); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } + + /** Builder for {@link AsyncBulkProcessor}. */ + public static final class Builder { + private final OpenSearchClient client; + private final Listener listener; + private int bulkActions = 50; + private long flushIntervalMillis = 5000; + private int concurrentRequests = 1; + + public Builder(OpenSearchClient client, Listener listener) { + this.client = client; + this.listener = listener; + } + + public Builder setBulkActions(int bulkActions) { + this.bulkActions = bulkActions; + return this; + } + + public Builder setFlushIntervalMillis(long millis) { + this.flushIntervalMillis = millis; + return this; + } + + public Builder setConcurrentRequests(int concurrentRequests) { + this.concurrentRequests = Math.max(1, concurrentRequests); + return this; + } + + public AsyncBulkProcessor build() { + return new AsyncBulkProcessor(this); + } + } + + /** Unchecked wrapper for checked exceptions thrown during bulk execution. 
*/ + private static final class BulkExecutionException extends RuntimeException { + BulkExecutionException(Throwable cause) { + super(cause); + } + } +} diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/BulkItemResponseToFailedFlag.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/BulkItemResponseToFailedFlag.java index e4eec09ef..0a064f0e9 100644 --- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/BulkItemResponseToFailedFlag.java +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/BulkItemResponseToFailedFlag.java @@ -17,25 +17,41 @@ package org.apache.stormcrawler.opensearch; -import java.io.IOException; +import java.util.Objects; import org.jetbrains.annotations.NotNull; -import org.opensearch.action.DocWriteRequest; -import org.opensearch.action.DocWriteResponse; -import org.opensearch.action.bulk.BulkItemResponse; -import org.opensearch.core.common.io.stream.StreamOutput; -import org.opensearch.core.rest.RestStatus; -import org.opensearch.core.xcontent.ToXContent; -import org.opensearch.core.xcontent.XContentBuilder; +import org.jetbrains.annotations.Nullable; +import org.opensearch.client.opensearch._types.ErrorCause; +import org.opensearch.client.opensearch.core.bulk.BulkResponseItem; public final class BulkItemResponseToFailedFlag { - @NotNull public final BulkItemResponse response; + @NotNull public final BulkResponseItem response; public final boolean failed; @NotNull public final String id; - public BulkItemResponseToFailedFlag(@NotNull BulkItemResponse response, boolean failed) { + public BulkItemResponseToFailedFlag(@NotNull BulkResponseItem response, boolean failed) { this.response = response; this.failed = failed; - this.id = response.getId(); + this.id = Objects.requireNonNull(response.id(), "BulkResponseItem id must not be null"); + } + + /** Returns the error cause, or {@code null} if the item did not fail. 
*/ + @Nullable + public ErrorCause getFailedCause() { + return response.error(); + } + + /** Returns a human-readable failure description, or {@code null} if the item did not fail. */ + @Nullable + public String getFailure() { + ErrorCause error = response.error(); + if (error == null) { + return null; + } + return error.reason() != null ? error.reason() : error.type(); + } + + public Integer getStatus() { + return response.status(); } @Override @@ -78,57 +94,4 @@ public String toString() { + '\'' + '}'; } - - public RestStatus status() { - return response.status(); - } - - public XContentBuilder toXContent(XContentBuilder builder, ToXContent.Params params) - throws IOException { - return response.toXContent(builder, params); - } - - public int getItemId() { - return response.getItemId(); - } - - public DocWriteRequest.OpType getOpType() { - return response.getOpType(); - } - - public String getIndex() { - return response.getIndex(); - } - - public long getVersion() { - return response.getVersion(); - } - - public T getResponse() { - return response.getResponse(); - } - - public boolean isFailed() { - return response.isFailed(); - } - - public String getFailureMessage() { - return response.getFailureMessage(); - } - - public BulkItemResponse.Failure getFailure() { - return response.getFailure(); - } - - public void writeTo(StreamOutput out) throws IOException { - response.writeTo(out); - } - - public void writeThin(StreamOutput out) throws IOException { - response.writeThin(out); - } - - public boolean isFragment() { - return response.isFragment(); - } } diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/IndexCreation.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/IndexCreation.java index 180a10743..ed44644c1 100644 --- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/IndexCreation.java +++ 
b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/IndexCreation.java @@ -17,29 +17,24 @@ package org.apache.stormcrawler.opensearch; -import com.google.common.base.Charsets; import com.google.common.io.Resources; import java.io.IOException; import java.net.URL; -import org.opensearch.OpenSearchException; -import org.opensearch.action.support.master.AcknowledgedResponse; -import org.opensearch.client.RequestOptions; -import org.opensearch.client.RestHighLevelClient; -import org.opensearch.client.indices.CreateIndexRequest; -import org.opensearch.client.indices.CreateIndexResponse; -import org.opensearch.client.indices.GetIndexRequest; -import org.opensearch.client.indices.IndexTemplatesExistRequest; -import org.opensearch.client.indices.PutIndexTemplateRequest; -import org.opensearch.common.xcontent.XContentType; +import java.nio.charset.StandardCharsets; +import org.opensearch.client.Request; +import org.opensearch.client.Response; +import org.opensearch.client.RestClient; +import org.opensearch.client.opensearch.OpenSearchClient; +import org.opensearch.client.opensearch.indices.ExistsTemplateRequest; +import org.opensearch.client.transport.rest_client.RestClientTransport; import org.slf4j.Logger; public class IndexCreation { public static synchronized void checkOrCreateIndex( - RestHighLevelClient client, String indexName, String boltType, Logger log) + OpenSearchClient client, String indexName, String boltType, Logger log) throws IOException { - final boolean indexExists = - client.indices().exists(new GetIndexRequest(indexName), RequestOptions.DEFAULT); + final boolean indexExists = client.indices().exists(req -> req.index(indexName)).value(); log.info("Index '{}' exists? 
{}", indexName, indexExists); // there's a possible check-then-update race condition // createIndex intentionally catches and logs exceptions from OpenSearch @@ -51,13 +46,12 @@ public static synchronized void checkOrCreateIndex( } public static synchronized void checkOrCreateIndexTemplate( - RestHighLevelClient client, String boltType, Logger log) throws IOException { + OpenSearchClient client, String boltType, Logger log) throws IOException { final String templateName = boltType + "-template"; final boolean templateExists = client.indices() - .existsTemplate( - new IndexTemplatesExistRequest(templateName), - RequestOptions.DEFAULT); + .existsTemplate(ExistsTemplateRequest.of(r -> r.name(templateName))) + .value(); log.info("Template '{}' exists? {}", templateName, templateExists); // there's a possible check-then-update race condition // createTemplate intentionally catches and logs exceptions from OpenSearch @@ -69,46 +63,48 @@ public static synchronized void checkOrCreateIndexTemplate( } private static boolean createTemplate( - RestHighLevelClient client, String templateName, String resourceName, Logger log) { + OpenSearchClient client, String templateName, String resourceName, Logger log) { try { - final PutIndexTemplateRequest createIndexRequest = - new PutIndexTemplateRequest(templateName); - final URL mapping = Thread.currentThread().getContextClassLoader().getResource(resourceName); - final String jsonIndexConfiguration = Resources.toString(mapping, Charsets.UTF_8); + final String jsonIndexConfiguration = + Resources.toString(mapping, StandardCharsets.UTF_8); - createIndexRequest.source(jsonIndexConfiguration, XContentType.JSON); + // Extract the low-level REST client to bypass typed builder limitations for raw JSON + RestClient restClient = ((RestClientTransport) client._transport()).restClient(); + Request request = new Request("PUT", "/_template/" + templateName); + request.setJsonEntity(jsonIndexConfiguration); - final AcknowledgedResponse 
createIndexResponse = - client.indices().putTemplate(createIndexRequest, RequestOptions.DEFAULT); - return createIndexResponse.isAcknowledged(); - } catch (IOException | OpenSearchException e) { + Response response = restClient.performRequest(request); + int statusCode = response.getStatusLine().getStatusCode(); + return statusCode == 200 || statusCode == 201; + } catch (Exception e) { log.warn("template '{}' not created", templateName, e); return false; } } private static boolean createIndex( - RestHighLevelClient client, String indexName, String resourceName, Logger log) { + OpenSearchClient client, String indexName, String resourceName, Logger log) { try { - - final CreateIndexRequest createIndexRequest = new CreateIndexRequest(indexName); - final URL mapping = Thread.currentThread().getContextClassLoader().getResource(resourceName); - final String jsonIndexConfiguration = Resources.toString(mapping, Charsets.UTF_8); + final String jsonIndexConfiguration = + Resources.toString(mapping, StandardCharsets.UTF_8); - createIndexRequest.source(jsonIndexConfiguration, XContentType.JSON); + // Extract the low-level REST client to bypass typed builder limitations for raw JSON + RestClient restClient = ((RestClientTransport) client._transport()).restClient(); + Request request = new Request("PUT", "/" + indexName); + request.setJsonEntity(jsonIndexConfiguration); - final CreateIndexResponse createIndexResponse = - client.indices().create(createIndexRequest, RequestOptions.DEFAULT); - return createIndexResponse.isAcknowledged(); - } catch (IOException | OpenSearchException e) { + Response response = restClient.performRequest(request); + int statusCode = response.getStatusLine().getStatusCode(); + return statusCode == 200 || statusCode == 201; + } catch (Exception e) { log.warn("index '{}' not created", indexName, e); return false; } diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/OpenSearchConnection.java 
b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/OpenSearchConnection.java index c3662a098..deb96c841 100644 --- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/OpenSearchConnection.java +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/OpenSearchConnection.java @@ -27,6 +27,7 @@ import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; import org.apache.commons.lang3.StringUtils; import org.apache.http.HttpHost; import org.apache.http.auth.AuthScope; @@ -39,18 +40,17 @@ import org.apache.stormcrawler.util.ConfUtils; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; -import org.opensearch.action.DocWriteRequest; -import org.opensearch.action.bulk.BulkProcessor; -import org.opensearch.action.bulk.BulkRequest; -import org.opensearch.action.bulk.BulkResponse; import org.opensearch.client.HttpAsyncResponseConsumerFactory; import org.opensearch.client.Node; import org.opensearch.client.RequestOptions; import org.opensearch.client.RestClient; import org.opensearch.client.RestClientBuilder; -import org.opensearch.client.RestHighLevelClient; +import org.opensearch.client.json.jackson.JacksonJsonpMapper; +import org.opensearch.client.opensearch.OpenSearchClient; +import org.opensearch.client.opensearch.core.bulk.BulkOperation; import org.opensearch.client.sniff.Sniffer; -import org.opensearch.common.unit.TimeValue; +import org.opensearch.client.transport.rest_client.RestClientOptions; +import org.opensearch.client.transport.rest_client.RestClientTransport; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -61,24 +61,188 @@ public final class OpenSearchConnection { private static final Logger LOG = LoggerFactory.getLogger(OpenSearchConnection.class); - @NotNull private final RestHighLevelClient client; + @NotNull private final OpenSearchClient client; - @NotNull private 
final BulkProcessor processor; + @NotNull private final AsyncBulkProcessor processor; @Nullable private final Sniffer sniffer; + @NotNull private final RestClient restClient; + private OpenSearchConnection( - @NotNull RestHighLevelClient c, @NotNull BulkProcessor p, @Nullable Sniffer s) { - processor = p; + @NotNull OpenSearchClient c, + @NotNull AsyncBulkProcessor p, + @Nullable Sniffer s, + @NotNull RestClient rc) { client = c; + processor = p; sniffer = s; + restClient = rc; } - public RestHighLevelClient getClient() { + public OpenSearchClient getClient() { return client; } - public static RestHighLevelClient getClient(Map stormConf, String boltType) { + /** + * Creates a standalone {@link OpenSearchClient}. Used by classes that need a client without a + * bulk processor (e.g. spouts, filters). Callers are responsible for closing the returned + * client's transport via {@code client._transport().close()}. + */ + public static OpenSearchClient getClient(Map stormConf, String boltType) { + return buildClientResources(stormConf, boltType, 100).client(); + } + + /** Adds a single bulk operation to the internal processor. */ + public void addToProcessor(final BulkOperation operation) { + processor.add(operation); + } + + /** + * Creates a connection with a default (no-op) listener. The values for bolt type are + * [indexer,status,metrics]. 
+ */ + public static OpenSearchConnection getConnection( + Map stormConf, String boltType) { + AsyncBulkProcessor.Listener listener = + new AsyncBulkProcessor.Listener() { + @Override + public void afterBulk( + long arg0, + org.opensearch.client.opensearch.core.BulkRequest arg1, + org.opensearch.client.opensearch.core.BulkResponse arg2) {} + + @Override + public void afterBulk( + long arg0, + org.opensearch.client.opensearch.core.BulkRequest arg1, + Throwable arg2) {} + + @Override + public void beforeBulk( + long arg0, org.opensearch.client.opensearch.core.BulkRequest arg1) {} + }; + return getConnection(stormConf, boltType, listener); + } + + public static OpenSearchConnection getConnection( + Map stormConf, String boltType, AsyncBulkProcessor.Listener listener) { + + final String dottedType = boltType + "."; + + final int bufferSize = + ConfUtils.getInt( + stormConf, Constants.PARAMPREFIX, dottedType, "responseBufferSize", 100); + + ClientResources cr = buildClientResources(stormConf, boltType, bufferSize); + + final String flushIntervalString = + ConfUtils.getString( + stormConf, Constants.PARAMPREFIX, dottedType, "flushInterval", "5s"); + + final long flushIntervalMillis = parseTimeValueToMillis(flushIntervalString, 5000); + + final int bulkActions = + ConfUtils.getInt(stormConf, Constants.PARAMPREFIX, dottedType, "bulkActions", 50); + + final int concurrentRequests = + ConfUtils.getInt( + stormConf, Constants.PARAMPREFIX, dottedType, "concurrentRequests", 1); + + AsyncBulkProcessor bulkProcessor = null; + Sniffer sniffer = null; + try { + bulkProcessor = + new AsyncBulkProcessor.Builder(cr.client(), listener) + .setBulkActions(bulkActions) + .setFlushIntervalMillis(flushIntervalMillis) + .setConcurrentRequests(concurrentRequests) + .build(); + + boolean sniff = + ConfUtils.getBoolean( + stormConf, Constants.PARAMPREFIX, dottedType, "sniff", true); + if (sniff) { + sniffer = Sniffer.builder(cr.restClient()).build(); + } + + return new 
OpenSearchConnection(cr.client(), bulkProcessor, sniffer, cr.restClient()); + } catch (Exception e) { + if (bulkProcessor != null) { + try { + bulkProcessor.close(); + } catch (Exception suppressed) { + e.addSuppressed(suppressed); + } + } + try { + cr.restClient().close(); + } catch (IOException suppressed) { + e.addSuppressed(suppressed); + } + throw e; + } + } + + private final AtomicBoolean isClosed = new AtomicBoolean(false); + + public void close() { + + if (!isClosed.compareAndSet(false, true)) { + LOG.warn("Tried to close an already closed connection!"); + return; + } + + LOG.debug("Start closing the OpenSearch connection"); + + // First, close the BulkProcessor ensuring pending actions are flushed + try { + boolean success = processor.awaitClose(60, TimeUnit.SECONDS); + if (!success) { + throw new RuntimeException( + "Failed to flush pending actions when closing BulkProcessor"); + } + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + + if (sniffer != null) { + sniffer.close(); + } + + // Now close the REST client (also closes the transport) + try { + restClient.close(); + } catch (IOException e) { + LOG.trace("Client threw IO exception."); + } + } + + /** + * Extracts the document ID from a {@link BulkOperation} regardless of its type (index, create, + * delete, update). 
+ */ + public static String getBulkOperationId(BulkOperation op) { + if (op.isIndex()) { + return op.index().id(); + } + if (op.isCreate()) { + return op.create().id(); + } + if (op.isDelete()) { + return op.delete().id(); + } + if (op.isUpdate()) { + return op.update().id(); + } + return null; + } + + // internal helpers + private record ClientResources(OpenSearchClient client, RestClient restClient) {} + + private static ClientResources buildClientResources( + Map stormConf, String boltType, int responseBufferSizeMB) { final String dottedType = boltType + "."; @@ -227,123 +391,73 @@ public static RestHighLevelClient getClient(Map stormConf, Strin builder.setCompressionEnabled(compression); - return new RestHighLevelClient(builder); - } - - public void addToProcessor(final DocWriteRequest request) { - processor.add(request); - } - - /** - * Creates a connection with a default listener. The values for bolt type are - * [indexer,status,metrics] - */ - public static OpenSearchConnection getConnection( - Map stormConf, String boltType) { - BulkProcessor.Listener listener = - new BulkProcessor.Listener() { - @Override - public void afterBulk(long arg0, BulkRequest arg1, BulkResponse arg2) {} - - @Override - public void afterBulk(long arg0, BulkRequest arg1, Throwable arg2) {} - - @Override - public void beforeBulk(long arg0, BulkRequest arg1) {} - }; - return getConnection(stormConf, boltType, listener); - } - - public static OpenSearchConnection getConnection( - Map stormConf, String boltType, BulkProcessor.Listener listener) { - - final RestHighLevelClient client = getClient(stormConf, boltType); - - final String dottedType = boltType + "."; - - final String flushIntervalString = - ConfUtils.getString( - stormConf, Constants.PARAMPREFIX, dottedType, "flushInterval", "5s"); - - final TimeValue flushInterval = - TimeValue.parseTimeValue( - flushIntervalString, TimeValue.timeValueSeconds(5), "flushInterval"); - - final int bulkActions = - ConfUtils.getInt(stormConf, 
Constants.PARAMPREFIX, dottedType, "bulkActions", 50); - - final int concurrentRequests = - ConfUtils.getInt( - stormConf, Constants.PARAMPREFIX, dottedType, "concurrentRequests", 1); - - final RequestOptions requestOptions = RequestOptions.DEFAULT; - final RequestOptions.Builder requestOptionsBuilder = requestOptions.toBuilder(); - final int bufferSize = - ConfUtils.getInt( - stormConf, Constants.PARAMPREFIX, dottedType, "responseBufferSize", 100); - - requestOptionsBuilder.setHttpAsyncResponseConsumerFactory( - new HttpAsyncResponseConsumerFactory.HeapBufferedResponseConsumerFactory( - bufferSize * 1024 * 1024)); - - final BulkProcessor bulkProcessor = - BulkProcessor.builder( - (request, bulkListener) -> - client.bulkAsync( - request, - requestOptionsBuilder.build(), - bulkListener), - listener) - .setFlushInterval(flushInterval) - .setBulkActions(bulkActions) - .setConcurrentRequests(concurrentRequests) - .build(); - - boolean sniff = - ConfUtils.getBoolean(stormConf, Constants.PARAMPREFIX, dottedType, "sniff", true); - Sniffer sniffer = null; - if (sniff) { - sniffer = Sniffer.builder(client.getLowLevelClient()).build(); + final RestClient restClient = builder.build(); + + // --- Response buffer size configuration --- + // The default HeapBufferedResponseConsumerFactory in the low-level REST client has + // a hardcoded limit of 100 MB. Large MSearch or aggregation responses can exceed + // this, causing ContentTooLongException. + // + // This fix works because we use RestClientTransport, which passes RequestOptions + // (including HttpAsyncResponseConsumerFactory) directly to the low-level RestClient. + // + // NOTE: if StormCrawler ever switches to ApacheHttpClient5Transport, this approach + // will silently stop working. 
In that case, use: + // ApacheHttpClient5Options.DEFAULT.toBuilder() + // .setHttpAsyncResponseConsumerFactory(factory).build() + // See: https://github.com/opensearch-project/opensearch-java/issues/1370 + final int DEFAULT_RESPONSE_BUFFER_SIZE_MB = 100; + final int effectiveBufferSizeMB; + if (responseBufferSizeMB <= 0) { + LOG.warn( + "Invalid responseBufferSize {}MB for {}, falling back to default {}MB", + responseBufferSizeMB, + boltType, + DEFAULT_RESPONSE_BUFFER_SIZE_MB); + effectiveBufferSizeMB = DEFAULT_RESPONSE_BUFFER_SIZE_MB; + } else { + effectiveBufferSizeMB = responseBufferSizeMB; } + LOG.info("OpenSearch response buffer size for {}: {}MB", boltType, effectiveBufferSizeMB); - return new OpenSearchConnection(client, bulkProcessor, sniffer); - } + final RequestOptions.Builder optionsBuilder = RequestOptions.DEFAULT.toBuilder(); + optionsBuilder.setHttpAsyncResponseConsumerFactory( + new HttpAsyncResponseConsumerFactory.HeapBufferedResponseConsumerFactory( + effectiveBufferSizeMB * 1024 * 1024)); + final RestClientOptions transportOptions = new RestClientOptions(optionsBuilder.build()); - private boolean isClosed = false; + final RestClientTransport transport = + new RestClientTransport(restClient, new JacksonJsonpMapper(), transportOptions); + final OpenSearchClient openSearchClient = new OpenSearchClient(transport); - public void close() { + return new ClientResources(openSearchClient, restClient); + } - if (isClosed) { - LOG.warn("Tried to close an already closed connection!"); - return; + /** + * Parses a time value string (e.g. "5s", "500ms", "1m") into milliseconds. + * + * @param value the string to parse + * @param defaultMillis the default if parsing fails + * @return milliseconds + */ + static long parseTimeValueToMillis(String value, long defaultMillis) { + if (value == null || value.isBlank()) { + return defaultMillis; } - - // Maybe some kind of identifier? 
- LOG.debug("Start closing the OpenSearch connection"); - - // First, close the BulkProcessor ensuring pending actions are flushed + value = value.trim(); try { - boolean success = processor.awaitClose(60, TimeUnit.SECONDS); - if (!success) { - throw new RuntimeException( - "Failed to flush pending actions when closing BulkProcessor"); + if (value.endsWith("ms")) { + return Long.parseLong(value.substring(0, value.length() - 2)); + } else if (value.endsWith("s")) { + return Long.parseLong(value.substring(0, value.length() - 1)) * 1000; + } else if (value.endsWith("m")) { + return Long.parseLong(value.substring(0, value.length() - 1)) * 60000; + } else { + return Long.parseLong(value); } - } catch (InterruptedException e) { - throw new RuntimeException(e); + } catch (NumberFormatException e) { + LOG.warn("Could not parse time value '{}', using default {}ms", value, defaultMillis); + return defaultMillis; } - - if (sniffer != null) { - sniffer.close(); - } - - // Now close the actual client - try { - client.close(); - } catch (IOException e) { - // ignore silently - LOG.trace("Client threw IO exception."); - } - - isClosed = true; } } diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/bolt/DeletionBolt.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/bolt/DeletionBolt.java index c67b90951..779c23c89 100644 --- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/bolt/DeletionBolt.java +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/bolt/DeletionBolt.java @@ -22,7 +22,6 @@ import com.github.benmanes.caffeine.cache.RemovalCause; import com.github.benmanes.caffeine.cache.RemovalListener; import java.lang.invoke.MethodHandles; -import java.util.Arrays; import java.util.LinkedList; import java.util.List; import java.util.Map; @@ -36,18 +35,15 @@ import org.apache.storm.topology.base.BaseRichBolt; import org.apache.storm.tuple.Tuple; import 
org.apache.stormcrawler.Metadata; +import org.apache.stormcrawler.opensearch.AsyncBulkProcessor; import org.apache.stormcrawler.opensearch.BulkItemResponseToFailedFlag; import org.apache.stormcrawler.opensearch.OpenSearchConnection; import org.apache.stormcrawler.util.ConfUtils; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; -import org.opensearch.action.DocWriteRequest; -import org.opensearch.action.bulk.BulkItemResponse; -import org.opensearch.action.bulk.BulkProcessor.Listener; -import org.opensearch.action.bulk.BulkRequest; -import org.opensearch.action.bulk.BulkResponse; -import org.opensearch.action.delete.DeleteRequest; -import org.opensearch.core.rest.RestStatus; +import org.opensearch.client.opensearch.core.BulkRequest; +import org.opensearch.client.opensearch.core.BulkResponse; +import org.opensearch.client.opensearch.core.bulk.BulkOperation; import org.slf4j.LoggerFactory; /** @@ -57,7 +53,7 @@ * delete documents which were indexed under the canonical URL. 
*/ public class DeletionBolt extends BaseRichBolt - implements RemovalListener>, Listener { + implements RemovalListener>, AsyncBulkProcessor.Listener { static final org.slf4j.Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); @@ -106,6 +102,7 @@ public void prepare( context.registerMetric("waitAck", () -> waitAck.estimatedSize(), 10); } + @Override public void onRemoval( @Nullable String key, @Nullable List value, @NotNull RemovalCause cause) { if (!cause.wasEvicted()) { @@ -138,8 +135,8 @@ public void execute(Tuple tuple) { // used final String docID = getDocumentID(metadata, url); - DeleteRequest dr = new DeleteRequest(getIndexName(metadata), docID); - connection.addToProcessor(dr); + final String targetIndex = getIndexName(metadata); + BulkOperation op = BulkOperation.of(b -> b.delete(d -> d.index(targetIndex).id(docID))); waitAckLock.lock(); try { @@ -153,6 +150,8 @@ public void execute(Tuple tuple) { } finally { waitAckLock.unlock(); } + + connection.addToProcessor(op); } @Override @@ -185,14 +184,14 @@ public void beforeBulk(long executionId, BulkRequest request) {} @Override public void afterBulk(long executionId, BulkRequest request, BulkResponse response) { var idsToBulkItemsWithFailedFlag = - Arrays.stream(response.getItems()) + response.items().stream() .map( bir -> { - String id = bir.getId(); - BulkItemResponse.Failure f = bir.getFailure(); + String id = bir.id(); + var error = bir.error(); boolean failed = false; - if (f != null) { - if (f.getStatus().equals(RestStatus.CONFLICT)) { + if (error != null) { + if (bir.status() == 409) { LOG.debug("Doc conflict ID {}", id); } else { failed = true; @@ -257,8 +256,6 @@ public void afterBulk(long executionId, BulkRequest request, BulkResponse respon for (Tuple t : associatedTuple) { String url = (String) t.getValueByField("url"); - Metadata metadata = (Metadata) t.getValueByField("metadata"); - if (!selected.failed) { ackCount++; _collector.ack(t); @@ -288,8 +285,9 @@ public void 
afterBulk(long executionId, BulkRequest request, Throwable failure) LOG.error("Exception with bulk {} - failing the whole lot ", executionId, failure); final var failedIds = - request.requests().stream() - .map(DocWriteRequest::id) + request.operations().stream() + .map(OpenSearchConnection::getBulkOperationId) + .filter(Objects::nonNull) .collect(Collectors.toUnmodifiableSet()); Map> failedTupleLists; waitAckLock.lock(); diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/bolt/IndexerBolt.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/bolt/IndexerBolt.java index 04de31cae..ce77c07d6 100644 --- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/bolt/IndexerBolt.java +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/bolt/IndexerBolt.java @@ -18,14 +18,13 @@ package org.apache.stormcrawler.opensearch.bolt; import static org.apache.stormcrawler.Constants.StatusStreamName; -import static org.opensearch.common.xcontent.XContentFactory.jsonBuilder; import com.github.benmanes.caffeine.cache.Cache; import com.github.benmanes.caffeine.cache.Caffeine; import com.github.benmanes.caffeine.cache.RemovalCause; import com.github.benmanes.caffeine.cache.RemovalListener; import java.io.IOException; -import java.util.Arrays; +import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.List; @@ -46,6 +45,7 @@ import org.apache.stormcrawler.Constants; import org.apache.stormcrawler.Metadata; import org.apache.stormcrawler.indexing.AbstractIndexerBolt; +import org.apache.stormcrawler.opensearch.AsyncBulkProcessor; import org.apache.stormcrawler.opensearch.BulkItemResponseToFailedFlag; import org.apache.stormcrawler.opensearch.IndexCreation; import org.apache.stormcrawler.opensearch.OpenSearchConnection; @@ -54,14 +54,9 @@ import org.apache.stormcrawler.util.PerSecondReducer; import org.jetbrains.annotations.NotNull; import 
org.jetbrains.annotations.Nullable; -import org.opensearch.action.DocWriteRequest; -import org.opensearch.action.bulk.BulkItemResponse; -import org.opensearch.action.bulk.BulkProcessor; -import org.opensearch.action.bulk.BulkRequest; -import org.opensearch.action.bulk.BulkResponse; -import org.opensearch.action.index.IndexRequest; -import org.opensearch.core.rest.RestStatus; -import org.opensearch.core.xcontent.XContentBuilder; +import org.opensearch.client.opensearch.core.BulkRequest; +import org.opensearch.client.opensearch.core.BulkResponse; +import org.opensearch.client.opensearch.core.bulk.BulkOperation; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -70,7 +65,7 @@ * <String,Object> from a named field. */ public class IndexerBolt extends AbstractIndexerBolt - implements RemovalListener>, BulkProcessor.Listener { + implements RemovalListener>, AsyncBulkProcessor.Listener { private static final Logger LOG = LoggerFactory.getLogger(IndexerBolt.class); @@ -203,19 +198,19 @@ public void execute(Tuple tuple) { final String docID = getDocumentID(metadata, normalisedurl); try { - final XContentBuilder builder = jsonBuilder().startObject(); + final Map source = new HashMap<>(); // display text of the document? if (StringUtils.isNotBlank(fieldNameForText())) { final String text = trimText(tuple.getStringByField("text")); if (!ignoreEmptyFields() || StringUtils.isNotBlank(text)) { - builder.field(fieldNameForText(), trimText(text)); + source.put(fieldNameForText(), trimText(text)); } } // send URL as field? if (StringUtils.isNotBlank(fieldNameForURL())) { - builder.field(fieldNameForURL(), normalisedurl); + source.put(fieldNameForURL(), normalisedurl); } // which metadata to display? 
@@ -225,30 +220,43 @@ public void execute(Tuple tuple) { if (entry.getValue().length == 1) { final String value = entry.getValue()[0]; if (!ignoreEmptyFields() || StringUtils.isNotBlank(value)) { - builder.field(entry.getKey(), value); + source.put(entry.getKey(), value); } } else if (entry.getValue().length > 1) { - builder.array(entry.getKey(), entry.getValue()); + source.put(entry.getKey(), List.of(entry.getValue())); } } - builder.endObject(); - - final IndexRequest indexRequest = - new IndexRequest(getIndexName(metadata)) - .source(builder) - .id(docID) - .create(create); - - if (pipeline != null) { - indexRequest.setPipeline(pipeline); + final String targetIndex = getIndexName(metadata); + final BulkOperation op; + if (create) { + op = + BulkOperation.of( + b -> + b.create( + c -> { + c.index(targetIndex).id(docID).document(source); + if (pipeline != null) { + c.pipeline(pipeline); + } + return c; + })); + } else { + op = + BulkOperation.of( + b -> + b.index( + idx -> { + idx.index(targetIndex) + .id(docID) + .document(source); + if (pipeline != null) { + idx.pipeline(pipeline); + } + return idx; + })); } - connection.addToProcessor(indexRequest); - - eventCounter.scope("Indexed").incrBy(1); - perSecMetrics.scope("Indexed").update(1); - waitAckLock.lock(); try { List tt = waitAck.getIfPresent(docID); @@ -261,7 +269,12 @@ public void execute(Tuple tuple) { } finally { waitAckLock.unlock(); } - } catch (IOException e) { + + connection.addToProcessor(op); + + eventCounter.scope("Indexed").incrBy(1); + perSecMetrics.scope("Indexed").update(1); + } catch (Exception e) { LOG.error("Error building document for OpenSearch", e); // do not send to status stream so that it gets replayed _collector.fail(tuple); @@ -291,17 +304,17 @@ public void beforeBulk(long executionId, BulkRequest request) { @Override public void afterBulk(long executionId, BulkRequest request, BulkResponse response) { eventCounter.scope("bulks_received").incrBy(1); - 
eventCounter.scope("bulk_msec").incrBy(response.getTook().getMillis()); + eventCounter.scope("bulk_msec").incrBy(response.took()); var idsToBulkItemsWithFailedFlag = - Arrays.stream(response.getItems()) + response.items().stream() .map( bir -> { - String id = bir.getId(); - BulkItemResponse.Failure f = bir.getFailure(); + String id = bir.id(); + var error = bir.error(); boolean failed = false; - if (f != null) { - if (f.getStatus().equals(RestStatus.CONFLICT)) { + if (error != null) { + if (bir.status() == 409) { eventCounter.scope("doc_conflicts").incrBy(1); LOG.debug("Doc conflict ID {}", id); } else { @@ -385,9 +398,8 @@ public void afterBulk(long executionId, BulkRequest request, BulkResponse respon var failure = selected.getFailure(); LOG.error("update ID {}, URL {}, failure: {}", id, url, failure); // there is something wrong with the content we should - // treat - // it as an ERROR - if (selected.getFailure().getStatus().equals(RestStatus.BAD_REQUEST)) { + // treat it as an ERROR + if (selected.getStatus() == 400) { metadata.setValue(Constants.STATUS_ERROR_SOURCE, "OpenSearch indexing"); metadata.setValue(Constants.STATUS_ERROR_MESSAGE, "invalid content"); _collector.emit( @@ -395,25 +407,9 @@ public void afterBulk(long executionId, BulkRequest request, BulkResponse respon _collector.ack(t); LOG.debug("Acked {} with ID {}", url, id); } else { - LOG.error("update ID {}, URL {}, failure: {}", id, url, failure); - // there is something wrong with the content we - // should - // treat - // it as an ERROR - if (failure.getStatus().equals(RestStatus.BAD_REQUEST)) { - metadata.setValue( - Constants.STATUS_ERROR_SOURCE, "OpenSearch indexing"); - metadata.setValue( - Constants.STATUS_ERROR_MESSAGE, "invalid content"); - _collector.emit( - StatusStreamName, - t, - new Values(url, metadata, Status.ERROR)); - _collector.ack(t); - } else { - // otherwise just fail it - _collector.fail(t); - } + // otherwise just fail it + _collector.fail(t); + LOG.debug("Failed {} with 
ID {}", url, id); } } } @@ -442,8 +438,9 @@ public void afterBulk(long executionId, BulkRequest request, Throwable failure) LOG.error("Exception with bulk {} - failing the whole lot ", executionId, failure); final var failedIds = - request.requests().stream() - .map(DocWriteRequest::id) + request.operations().stream() + .map(OpenSearchConnection::getBulkOperationId) + .filter(Objects::nonNull) .collect(Collectors.toUnmodifiableSet()); Map> failedTupleLists; waitAckLock.lock(); diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/filtering/JSONURLFilterWrapper.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/filtering/JSONURLFilterWrapper.java index 900223fa0..d983bb0cc 100644 --- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/filtering/JSONURLFilterWrapper.java +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/filtering/JSONURLFilterWrapper.java @@ -19,7 +19,9 @@ import com.fasterxml.jackson.databind.JsonNode; import java.io.ByteArrayInputStream; +import java.io.IOException; import java.net.URL; +import java.nio.charset.StandardCharsets; import java.util.Map; import java.util.Timer; import java.util.TimerTask; @@ -29,10 +31,9 @@ import org.apache.stormcrawler.opensearch.OpenSearchConnection; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; -import org.opensearch.action.get.GetRequest; -import org.opensearch.action.get.GetResponse; -import org.opensearch.client.RequestOptions; -import org.opensearch.client.RestHighLevelClient; +import org.opensearch.client.json.JsonData; +import org.opensearch.client.opensearch.OpenSearchClient; +import org.opensearch.client.opensearch.core.GetResponse; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -46,8 +47,8 @@ * *

  *  {
- *     "class": "org.apache.stormcrawler.elasticsearch.filtering.JSONURLFilterWrapper",
- *     "name": "ESFastURLFilter",
+ *     "class": "org.apache.stormcrawler.opensearch.filtering.JSONURLFilterWrapper",
+ *     "name": "OSFastURLFilter",
  *     "params": {
  *         "refresh": "60",
  *         "delegate": {
@@ -71,6 +72,8 @@ public class JSONURLFilterWrapper extends URLFilter {
     private static final Logger LOG = LoggerFactory.getLogger(JSONURLFilterWrapper.class);
 
     private URLFilter delegatedURLFilter;
+    private Timer refreshTimer;
+    private OpenSearchClient osClient;
 
     public void configure(@NotNull Map<String, Object> stormConf, @NotNull JsonNode filterParams) {
 
@@ -127,42 +130,40 @@ public void configure(@NotNull Map stormConf, @NotNull JsonNode
 
         final JSONResource resource = (JSONResource) delegatedURLFilter;
 
-        new Timer()
-                .schedule(
-                        new TimerTask() {
-                            private RestHighLevelClient osClient;
-
-                            public void run() {
-                                if (osClient == null) {
-                                    try {
-                                        osClient =
-                                                OpenSearchConnection.getClient(stormConf, "config");
-                                    } catch (Exception e) {
-                                        LOG.error(
-                                                "Exception while creating OpenSearch connection",
-                                                e);
-                                    }
-                                }
-                                if (osClient != null) {
-                                    LOG.info("Reloading json resources from OpenSearch");
-                                    try {
-                                        GetResponse response =
-                                                osClient.get(
-                                                        new GetRequest(
-                                                                "config",
-                                                                resource.getResourceFile()),
-                                                        RequestOptions.DEFAULT);
-                                        resource.loadJSONResources(
-                                                new ByteArrayInputStream(
-                                                        response.getSourceAsBytes()));
-                                    } catch (Exception e) {
-                                        LOG.error("Can't load config from OpenSearch", e);
-                                    }
+        refreshTimer = new Timer();
+        refreshTimer.schedule(
+                new TimerTask() {
+                    public void run() {
+                        if (osClient == null) {
+                            try {
+                                osClient = OpenSearchConnection.getClient(stormConf, "config");
+                            } catch (Exception e) {
+                                LOG.error("Exception while creating OpenSearch connection", e);
+                            }
+                        }
+                        if (osClient != null) {
+                            LOG.info("Reloading json resources from OpenSearch");
+                            try {
+                                GetResponse<JsonData> response =
+                                        osClient.get(
+                                                g ->
+                                                        g.index("config")
+                                                                .id(resource.getResourceFile()),
+                                                JsonData.class);
+                                if (response.found() && response.source() != null) {
+                                    String json = response.source().toJson().toString();
+                                    resource.loadJSONResources(
+                                            new ByteArrayInputStream(
+                                                    json.getBytes(StandardCharsets.UTF_8)));
                                 }
+                            } catch (Exception e) {
+                                LOG.error("Can't load config from OpenSearch", e);
                             }
-                        },
-                        0,
-                        refreshRate * 1000);
+                        }
+                    }
+                },
+                0,
+                refreshRate * 1000);
     }
 
     @Override
@@ -172,4 +173,18 @@ public void run() {
             @NotNull String urlToFilter) {
         return delegatedURLFilter.filter(sourceUrl, sourceMetadata, urlToFilter);
     }
+
+    @Override
+    public void cleanup() {
+        if (refreshTimer != null) {
+            refreshTimer.cancel();
+        }
+        if (osClient != null) {
+            try {
+                osClient._transport().close();
+            } catch (IOException e) {
+                LOG.error("Exception when closing OpenSearch client", e);
+            }
+        }
+    }
 }
diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/metrics/MetricsConsumer.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/metrics/MetricsConsumer.java
index 6b9ccf4cb..c46a2b734 100644
--- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/metrics/MetricsConsumer.java
+++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/metrics/MetricsConsumer.java
@@ -17,12 +17,11 @@
 
 package org.apache.stormcrawler.opensearch.metrics;
 
-import static org.opensearch.common.xcontent.XContentFactory.jsonBuilder;
-
 import java.io.IOException;
 import java.text.SimpleDateFormat;
 import java.util.Collection;
 import java.util.Date;
+import java.util.HashMap;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Map.Entry;
@@ -32,8 +31,7 @@
 import org.apache.stormcrawler.opensearch.IndexCreation;
 import org.apache.stormcrawler.opensearch.OpenSearchConnection;
 import org.apache.stormcrawler.util.ConfUtils;
-import org.opensearch.action.index.IndexRequest;
-import org.opensearch.core.xcontent.XContentBuilder;
+import org.opensearch.client.opensearch.core.bulk.BulkOperation;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -144,19 +142,20 @@ private String getIndexName(Date timestamp) {
 
     private void indexDataPoint(TaskInfo taskInfo, Date timestamp, String name, double value) {
         try {
-            XContentBuilder builder = jsonBuilder().startObject();
-            builder.field("stormId", stormID);
-            builder.field("srcComponentId", taskInfo.srcComponentId);
-            builder.field("srcTaskId", taskInfo.srcTaskId);
-            builder.field("srcWorkerHost", taskInfo.srcWorkerHost);
-            builder.field("srcWorkerPort", taskInfo.srcWorkerPort);
-            builder.field("name", name);
-            builder.field("value", value);
-            builder.field("timestamp", timestamp);
-            builder.endObject();
-
-            IndexRequest indexRequest = new IndexRequest(getIndexName(timestamp)).source(builder);
-            connection.addToProcessor(indexRequest);
+            Map<String, Object> doc = new HashMap<>();
+            doc.put("stormId", stormID);
+            doc.put("srcComponentId", taskInfo.srcComponentId);
+            doc.put("srcTaskId", taskInfo.srcTaskId);
+            doc.put("srcWorkerHost", taskInfo.srcWorkerHost);
+            doc.put("srcWorkerPort", taskInfo.srcWorkerPort);
+            doc.put("name", name);
+            doc.put("value", value);
+            doc.put("timestamp", timestamp.toInstant().toString());
+
+            final String targetIndex = getIndexName(timestamp);
+            BulkOperation op =
+                    BulkOperation.of(b -> b.index(idx -> idx.index(targetIndex).document(doc)));
+            connection.addToProcessor(op);
         } catch (Exception e) {
             LOG.error("problem when building request for OpenSearch", e);
         }
diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/metrics/StatusMetricsBolt.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/metrics/StatusMetricsBolt.java
index 56edf6967..697dd17a6 100644
--- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/metrics/StatusMetricsBolt.java
+++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/metrics/StatusMetricsBolt.java
@@ -17,8 +17,11 @@
 
 package org.apache.stormcrawler.opensearch.metrics;
 
-import java.util.HashMap;
 import java.util.Map;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.CompletionException;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.atomic.AtomicBoolean;
 import org.apache.storm.Config;
 import org.apache.storm.task.OutputCollector;
 import org.apache.storm.task.TopologyContext;
@@ -29,12 +32,8 @@
 import org.apache.stormcrawler.opensearch.Constants;
 import org.apache.stormcrawler.opensearch.OpenSearchConnection;
 import org.apache.stormcrawler.util.ConfUtils;
-import org.opensearch.client.RequestOptions;
-import org.opensearch.client.core.CountRequest;
-import org.opensearch.client.core.CountResponse;
-import org.opensearch.core.action.ActionListener;
-import org.opensearch.index.query.QueryBuilders;
-import org.opensearch.search.builder.SearchSourceBuilder;
+import org.opensearch.client.opensearch.OpenSearchClient;
+import org.opensearch.client.opensearch._types.FieldValue;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -52,45 +51,22 @@ public class StatusMetricsBolt extends BaseRichBolt {
 
     private String indexName;
 
-    private OpenSearchConnection connection;
+    private OpenSearchClient client;
 
-    private Map<String, Long> latestStatusCounts = new HashMap<>(6);
+    private Map<String, Long> latestStatusCounts = new ConcurrentHashMap<>(6);
 
     private int freqStats = 60;
 
     private OutputCollector _collector;
 
-    private transient StatusActionListener[] listeners;
+    private transient StatusCounter[] counters;
 
-    private class StatusActionListener implements ActionListener {
+    private static final class StatusCounter {
+        final String name;
+        final AtomicBoolean ready = new AtomicBoolean(true);
 
-        private final String name;
-
-        private boolean ready = true;
-
-        public boolean isReady() {
-            return ready;
-        }
-
-        public void busy() {
-            this.ready = false;
-        }
-
-        StatusActionListener(String statusName) {
-            name = statusName;
-        }
-
-        @Override
-        public void onResponse(CountResponse response) {
-            ready = true;
-            LOG.debug("Got {} counts for status:{}", response.getCount(), name);
-            latestStatusCounts.put(name, response.getCount());
-        }
-
-        @Override
-        public void onFailure(Exception e) {
-            ready = true;
-            LOG.error("Failure when getting counts for status:{}", name, e);
+        StatusCounter(String name) {
+            this.name = name;
         }
     }
 
@@ -100,9 +76,9 @@ public void prepare(
         _collector = collector;
         indexName = ConfUtils.getString(stormConf, OSStatusIndexNameParamName, "status");
         try {
-            connection = OpenSearchConnection.getConnection(stormConf, OSBoltType);
+            client = OpenSearchConnection.getClient(stormConf, OSBoltType);
         } catch (Exception e1) {
-            LOG.error("Can't connect to ElasticSearch", e1);
+            LOG.error("Can't connect to OpenSearch", e1);
             throw new RuntimeException(e1);
         }
 
@@ -113,14 +89,14 @@ public void prepare(
                 },
                 freqStats);
 
-        listeners = new StatusActionListener[6];
+        counters = new StatusCounter[6];
 
-        listeners[0] = new StatusActionListener("DISCOVERED");
-        listeners[1] = new StatusActionListener("FETCHED");
-        listeners[2] = new StatusActionListener("FETCH_ERROR");
-        listeners[3] = new StatusActionListener("REDIRECTION");
-        listeners[4] = new StatusActionListener("ERROR");
-        listeners[5] = new StatusActionListener("TOTAL");
+        counters[0] = new StatusCounter("DISCOVERED");
+        counters[1] = new StatusCounter("FETCHED");
+        counters[2] = new StatusCounter("FETCH_ERROR");
+        counters[3] = new StatusCounter("REDIRECTION");
+        counters[4] = new StatusCounter("ERROR");
+        counters[5] = new StatusCounter("TOTAL");
     }
 
     @Override
@@ -140,26 +116,69 @@ public void execute(Tuple input) {
             return;
         }
 
-        for (StatusActionListener listener : listeners) {
+        for (StatusCounter counter : counters) {
             // still waiting for results from previous request
-            if (!listener.isReady()) {
-                LOG.debug("Not ready to get counts for status {}", listener.name);
+            if (!counter.ready.compareAndSet(true, false)) {
+                LOG.debug("Not ready to get counts for status {}", counter.name);
                 continue;
             }
-            CountRequest request = new CountRequest(indexName);
-            if (!listener.name.equalsIgnoreCase("TOTAL")) {
-                SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
-                sourceBuilder.query(QueryBuilders.termQuery("status", listener.name));
-                request.source(sourceBuilder);
-            }
-            listener.busy();
-            connection.getClient().countAsync(request, RequestOptions.DEFAULT, listener);
+            final String statusName = counter.name;
+            CompletableFuture.supplyAsync(
+                            () -> {
+                                try {
+                                    if (statusName.equalsIgnoreCase("TOTAL")) {
+                                        return client.count(c -> c.index(indexName));
+                                    } else {
+                                        return client.count(
+                                                c ->
+                                                        c.index(indexName)
+                                                                .query(
+                                                                        q ->
+                                                                                q.term(
+                                                                                        t ->
+                                                                                                t.field(
+                                                                                                                "status")
+                                                                                                        .value(
+                                                                                                                FieldValue
+                                                                                                                        .of(
+                                                                                                                                statusName)))));
+                                    }
+                                } catch (Exception e) {
+                                    throw new CompletionException(e);
+                                }
+                            })
+                    .thenAccept(
+                            response -> {
+                                counter.ready.set(true);
+                                LOG.debug(
+                                        "Got {} counts for status:{}",
+                                        response.count(),
+                                        statusName);
+                                latestStatusCounts.put(statusName, response.count());
+                            })
+                    .exceptionally(
+                            e -> {
+                                counter.ready.set(true);
+                                Throwable cause =
+                                        e instanceof CompletionException ? e.getCause() : e;
+                                LOG.error(
+                                        "Failure when getting counts for status:{}",
+                                        statusName,
+                                        cause);
+                                return null;
+                            });
         }
     }
 
     @Override
     public void cleanup() {
-        connection.close();
+        if (client != null) {
+            try {
+                client._transport().close();
+            } catch (Exception e) {
+                LOG.error("Exception closing client transport", e);
+            }
+        }
     }
 
     @Override
diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java
index e475afb2e..b96563e86 100644
--- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java
+++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java
@@ -19,6 +19,8 @@
 
 import com.fasterxml.jackson.databind.JsonNode;
 import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 import java.util.Map;
 import java.util.Timer;
 import java.util.TimerTask;
@@ -27,10 +29,9 @@
 import org.apache.stormcrawler.parse.ParseFilter;
 import org.apache.stormcrawler.parse.ParseResult;
 import org.jetbrains.annotations.NotNull;
-import org.opensearch.action.get.GetRequest;
-import org.opensearch.action.get.GetResponse;
-import org.opensearch.client.RequestOptions;
-import org.opensearch.client.RestHighLevelClient;
+import org.opensearch.client.json.JsonData;
+import org.opensearch.client.opensearch.OpenSearchClient;
+import org.opensearch.client.opensearch.core.GetResponse;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.DocumentFragment;
@@ -45,7 +46,7 @@
  *
  * 
  *  {
- *     "class": "org.apache.stormcrawler.elasticsearch.parse.filter.JSONResourceWrapper",
+ *     "class": "org.apache.stormcrawler.opensearch.parse.filter.JSONResourceWrapper",
  *     "name": "OpenSearchCollectionTagger",
  *     "params": {
  *         "refresh": "60",
@@ -70,6 +71,8 @@ public class JSONResourceWrapper extends ParseFilter {
     private static final Logger LOG = LoggerFactory.getLogger(JSONResourceWrapper.class);
 
     private ParseFilter delegatedParseFilter;
+    private Timer refreshTimer;
+    private OpenSearchClient osClient;
 
     public void configure(@NotNull Map<String, Object> stormConf, @NotNull JsonNode filterParams) {
 
@@ -126,46 +129,58 @@ public void configure(@NotNull Map<String, Object> stormConf, @NotNull JsonNode
 
         final JSONResource resource = (JSONResource) delegatedParseFilter;
 
-        new Timer()
-                .schedule(
-                        new TimerTask() {
-                            private RestHighLevelClient esClient;
-
-                            public void run() {
-                                if (esClient == null) {
-                                    try {
-                                        esClient =
-                                                OpenSearchConnection.getClient(stormConf, "config");
-                                    } catch (Exception e) {
-                                        LOG.error(
-                                                "Exception while creating OpenSearch connection",
-                                                e);
-                                    }
-                                }
-                                if (esClient != null) {
-                                    LOG.info("Reloading json resources from OpenSearch");
-                                    try {
-                                        GetResponse response =
-                                                esClient.get(
-                                                        new GetRequest(
-                                                                "config",
-                                                                resource.getResourceFile()),
-                                                        RequestOptions.DEFAULT);
-                                        resource.loadJSONResources(
-                                                new ByteArrayInputStream(
-                                                        response.getSourceAsBytes()));
-                                    } catch (Exception e) {
-                                        LOG.error("Can't load config from OpenSearch", e);
-                                    }
+        refreshTimer = new Timer();
+        refreshTimer.schedule(
+                new TimerTask() {
+                    public void run() {
+                        if (osClient == null) {
+                            try {
+                                osClient = OpenSearchConnection.getClient(stormConf, "config");
+                            } catch (Exception e) {
+                                LOG.error("Exception while creating OpenSearch connection", e);
+                            }
+                        }
+                        if (osClient != null) {
+                            LOG.info("Reloading json resources from OpenSearch");
+                            try {
+                                GetResponse<JsonData> response =
+                                        osClient.get(
+                                                g ->
+                                                        g.index("config")
+                                                                .id(resource.getResourceFile()),
+                                                JsonData.class);
+                                if (response.found() && response.source() != null) {
+                                    String json = response.source().toJson().toString();
+                                    resource.loadJSONResources(
+                                            new ByteArrayInputStream(
+                                                    json.getBytes(StandardCharsets.UTF_8)));
                                 }
+                            } catch (Exception e) {
+                                LOG.error("Can't load config from OpenSearch", e);
                             }
-                        },
-                        0,
-                        refreshRate * 1000);
+                        }
+                    }
+                },
+                0,
+                refreshRate * 1000);
     }
 
     @Override
     public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult parse) {
         delegatedParseFilter.filter(URL, content, doc, parse);
     }
+
+    @Override
+    public void cleanup() {
+        if (refreshTimer != null) {
+            refreshTimer.cancel();
+        }
+        if (osClient != null) {
+            try {
+                osClient._transport().close();
+            } catch (IOException e) {
+                LOG.error("Exception when closing OpenSearch client", e);
+            }
+        }
+    }
 }
diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/AbstractSpout.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/AbstractSpout.java
index 43b0e4289..6cd315d38 100644
--- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/AbstractSpout.java
+++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/AbstractSpout.java
@@ -31,8 +31,7 @@
 import org.apache.stormcrawler.opensearch.OpenSearchConnection;
 import org.apache.stormcrawler.persistence.AbstractQueryingSpout;
 import org.apache.stormcrawler.util.ConfUtils;
-import org.opensearch.client.RestHighLevelClient;
-import org.opensearch.search.SearchHit;
+import org.opensearch.client.opensearch.OpenSearchClient;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -72,7 +71,7 @@ public abstract class AbstractSpout extends AbstractQueryingSpout {
 
     protected String indexName;
 
-    protected static RestHighLevelClient client;
+    protected static OpenSearchClient client;
 
     /**
      * when using multiple instances - each one is in charge of a specific shard useful when
@@ -180,14 +179,20 @@ public void open(
     /** Builds a query and use it retrieve the results from OS * */
     protected abstract void populateBuffer();
 
-    protected final boolean addHitToBuffer(SearchHit hit) {
-        Map<String, Object> keyValues = hit.getSourceAsMap();
-        String url = (String) keyValues.get("url");
+    /**
+     * Adds a document source to the URL buffer unless it is already being processed.
+     *
+     * @param source the document source as a key-value map (must contain a "url" entry)
+     * @return {@code true} if the URL was added to the buffer, {@code false} if it was already
+     *     being processed or already present
+     */
+    protected final boolean addHitToBuffer(Map<String, Object> source) {
+        String url = (String) source.get("url");
         // is already being processed - skip it!
         if (beingProcessed.containsKey(url)) {
             return false;
         }
-        return buffer.add(url, fromKeyValues(keyValues));
+        return buffer.add(url, fromKeyValues(source));
     }
 
     protected final Metadata fromKeyValues(Map<String, Object> keyValues) {
@@ -225,11 +230,14 @@ public void fail(Object msgId) {
 
     @Override
     public void close() {
-        if (client != null) {
-            try {
-                client.close();
-            } catch (IOException e) {
-                LOG.error("Exception caught when closing client", e);
+        synchronized (AbstractSpout.class) {
+            if (client != null) {
+                try {
+                    client._transport().close();
+                } catch (IOException e) {
+                    LOG.error("Exception caught when closing client", e);
+                }
+                client = null;
             }
         }
     }
diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/AggregationSpout.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/AggregationSpout.java
index 2eb97102f..62bc6faeb 100644
--- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/AggregationSpout.java
+++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/AggregationSpout.java
@@ -17,45 +17,37 @@
 
 package org.apache.stormcrawler.opensearch.persistence;
 
-import static org.opensearch.index.query.QueryBuilders.boolQuery;
-
+import java.io.IOException;
 import java.time.Instant;
+import java.time.ZoneOffset;
+import java.time.format.DateTimeFormatter;
 import java.util.Calendar;
+import java.util.Collections;
 import java.util.Date;
 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
 import java.util.TimeZone;
-import java.util.concurrent.TimeUnit;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.CompletionException;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.storm.spout.SpoutOutputCollector;
 import org.apache.storm.task.TopologyContext;
 import org.apache.stormcrawler.Metadata;
 import org.apache.stormcrawler.opensearch.Constants;
 import org.apache.stormcrawler.util.ConfUtils;
-import org.joda.time.format.ISODateTimeFormat;
-import org.opensearch.action.search.SearchRequest;
-import org.opensearch.action.search.SearchResponse;
-import org.opensearch.client.RequestOptions;
-import org.opensearch.core.action.ActionListener;
-import org.opensearch.index.query.BoolQueryBuilder;
-import org.opensearch.index.query.QueryBuilders;
-import org.opensearch.search.SearchHit;
-import org.opensearch.search.aggregations.AggregationBuilders;
-import org.opensearch.search.aggregations.Aggregations;
-import org.opensearch.search.aggregations.BucketOrder;
-import org.opensearch.search.aggregations.bucket.SingleBucketAggregation;
-import org.opensearch.search.aggregations.bucket.sampler.DiversifiedAggregationBuilder;
-import org.opensearch.search.aggregations.bucket.terms.Terms;
-import org.opensearch.search.aggregations.bucket.terms.Terms.Bucket;
-import org.opensearch.search.aggregations.bucket.terms.TermsAggregationBuilder;
-import org.opensearch.search.aggregations.metrics.TopHits;
-import org.opensearch.search.builder.SearchSourceBuilder;
-import org.opensearch.search.sort.FieldSortBuilder;
-import org.opensearch.search.sort.SortBuilders;
-import org.opensearch.search.sort.SortOrder;
+import org.opensearch.client.json.JsonData;
+import org.opensearch.client.opensearch._types.SortOrder;
+import org.opensearch.client.opensearch._types.aggregations.Aggregate;
+import org.opensearch.client.opensearch._types.aggregations.Aggregation;
+import org.opensearch.client.opensearch._types.aggregations.StringTermsBucket;
+import org.opensearch.client.opensearch._types.aggregations.TopHitsAggregate;
+import org.opensearch.client.opensearch.core.SearchRequest;
+import org.opensearch.client.opensearch.core.SearchResponse;
+import org.opensearch.client.opensearch.core.search.Hit;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -65,7 +57,7 @@
  * the same number of spout instances as OpenSearch shards. Guarantees a good mix of URLs by
  * aggregating them by an arbitrary field e.g. key.
  */
-public class AggregationSpout extends AbstractSpout implements ActionListener<SearchResponse> {
+public class AggregationSpout extends AbstractSpout {
 
     private static final Logger LOG = LoggerFactory.getLogger(AggregationSpout.class);
 
@@ -104,106 +96,170 @@ protected void populateBuffer() {
             lastTimeResetToNow = Instant.now();
         }
 
-        String formattedQueryDate = ISODateTimeFormat.dateTimeNoMillis().print(queryDate.getTime());
+        String formattedQueryDate =
+                Instant.ofEpochMilli(queryDate.getTime())
+                        .atOffset(ZoneOffset.UTC)
+                        .format(DateTimeFormatter.ISO_OFFSET_DATE_TIME);
 
         LOG.info("{} Populating buffer with nextFetchDate <= {}", logIdprefix, formattedQueryDate);
 
-        BoolQueryBuilder queryBuilder =
-                boolQuery()
-                        .filter(QueryBuilders.rangeQuery("nextFetchDate").lte(formattedQueryDate));
-
-        if (filterQueries != null) {
-            for (String filterQuery : filterQueries) {
-                queryBuilder.filter(QueryBuilders.queryStringQuery(filterQuery));
-            }
+        // Build the top_hits sub-aggregation
+        Aggregation topHitsAgg =
+                Aggregation.of(
+                        a ->
+                                a.topHits(
+                                        th -> {
+                                            th.size(maxURLsPerBucket).explain(false);
+                                            for (String bsf : bucketSortField) {
+                                                th.sort(
+                                                        s ->
+                                                                s.field(
+                                                                        fs ->
+                                                                                fs.field(bsf)
+                                                                                        .order(
+                                                                                                SortOrder
+                                                                                                        .Asc)));
+                                            }
+                                            return th;
+                                        }));
+
+        // Build the terms (partition) aggregation with top_hits sub-agg
+        Aggregation.Builder.ContainerBuilder partitionAggBuilder =
+                new Aggregation.Builder()
+                        .terms(
+                                t -> {
+                                    t.field(partitionField).size(maxBucketNum);
+                                    // sort between buckets by the min sub-aggregation
+                                    if (StringUtils.isNotBlank(totalSortField)) {
+                                        t.order(
+                                                Collections.singletonList(
+                                                        Collections.singletonMap(
+                                                                "top_hit", SortOrder.Asc)));
+                                    }
+                                    return t;
+                                })
+                        .aggregations("docs", topHitsAgg);
+
+        // add the min sub-aggregation used for sorting between buckets
+        if (StringUtils.isNotBlank(totalSortField)) {
+            partitionAggBuilder.aggregations(
+                    "top_hit", Aggregation.of(minAgg -> minAgg.min(m -> m.field(totalSortField))));
         }
 
-        SearchRequest request = new SearchRequest(indexName);
-
-        SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
-        sourceBuilder.query(queryBuilder);
-        sourceBuilder.from(0);
-        sourceBuilder.size(0);
-        sourceBuilder.explain(false);
-        sourceBuilder.trackTotalHits(false);
+        Aggregation partitionAgg = partitionAggBuilder.build();
+
+        // Build the search request
+        SearchRequest.Builder requestBuilder =
+                new SearchRequest.Builder()
+                        .index(indexName)
+                        .size(0)
+                        .trackTotalHits(t -> t.enabled(false))
+                        .query(
+                                q ->
+                                        q.bool(
+                                                b -> {
+                                                    b.filter(
+                                                            f ->
+                                                                    f.range(
+                                                                            r ->
+                                                                                    r.field(
+                                                                                                    "nextFetchDate")
+                                                                                            .lte(
+                                                                                                    JsonData
+                                                                                                            .of(
+                                                                                                                    formattedQueryDate))));
+                                                    if (filterQueries != null) {
+                                                        for (String fq : filterQueries) {
+                                                            b.filter(
+                                                                    f ->
+                                                                            f.queryString(
+                                                                                    qs ->
+                                                                                            qs
+                                                                                                    .query(
+                                                                                                            fq)));
+                                                        }
+                                                    }
+                                                    return b;
+                                                }));
 
         if (queryTimeout != -1) {
-            sourceBuilder.timeout(
-                    new org.opensearch.common.unit.TimeValue(queryTimeout, TimeUnit.SECONDS));
-        }
-
-        TermsAggregationBuilder aggregations =
-                AggregationBuilders.terms("partition").field(partitionField).size(maxBucketNum);
-
-        org.opensearch.search.aggregations.metrics.TopHitsAggregationBuilder tophits =
-                AggregationBuilders.topHits("docs").size(maxURLsPerBucket).explain(false);
-
-        // sort within a bucket
-        for (String bsf : bucketSortField) {
-            FieldSortBuilder sorter = SortBuilders.fieldSort(bsf).order(SortOrder.ASC);
-            tophits.sort(sorter);
-        }
-
-        aggregations.subAggregation(tophits);
-
-        // sort between buckets
-        if (StringUtils.isNotBlank(totalSortField)) {
-            org.opensearch.search.aggregations.metrics.MinAggregationBuilder minBuilder =
-                    AggregationBuilders.min("top_hit").field(totalSortField);
-            aggregations.subAggregation(minBuilder);
-            aggregations.order(BucketOrder.aggregation("top_hit", true));
+            requestBuilder.timeout(queryTimeout + "s");
         }
 
         if (sample) {
-            DiversifiedAggregationBuilder sab = new DiversifiedAggregationBuilder("sample");
-            sab.field(partitionField).maxDocsPerValue(maxURLsPerBucket);
-            sab.shardSize(maxURLsPerBucket * maxBucketNum);
-            sab.subAggregation(aggregations);
-            sourceBuilder.aggregation(sab);
+            // Wrap in a diversified sampler aggregation
+            requestBuilder.aggregations(
+                    "sample",
+                    Aggregation.of(
+                            a ->
+                                    a.diversifiedSampler(
+                                                    ds ->
+                                                            ds.field(partitionField)
+                                                                    .maxDocsPerValue(
+                                                                            maxURLsPerBucket)
+                                                                    .shardSize(
+                                                                            maxURLsPerBucket
+                                                                                    * maxBucketNum))
+                                            .aggregations("partition", partitionAgg)));
         } else {
-            sourceBuilder.aggregation(aggregations);
+            requestBuilder.aggregations("partition", partitionAgg);
         }
 
-        request.source(sourceBuilder);
-
-        // https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-preference.html
-        // _shards:2,3
-        // specific shard but ideally a local copy of it
+        // shard preference for routing
         if (shardID != -1) {
-            request.preference("_shards:" + shardID + "|_local");
+            requestBuilder.preference("_shards:" + shardID + "|_local");
         }
 
+        SearchRequest request = requestBuilder.build();
+
         // dump query to log
         LOG.debug("{} OpenSearch query {}", logIdprefix, request);
 
-        LOG.trace("{} isInquery set to true");
+        LOG.trace("{} isInquery set to true", logIdprefix);
         isInQuery.set(true);
-        client.searchAsync(request, RequestOptions.DEFAULT, this);
-    }
 
-    @Override
-    public void onFailure(Exception arg0) {
-        LOG.error("{} Exception with OpenSearch query", logIdprefix, arg0);
-        markQueryReceivedNow();
+        CompletableFuture.supplyAsync(
+                        () -> {
+                            try {
+                                return client.search(request, JsonData.class);
+                            } catch (IOException e) {
+                                throw new CompletionException(e);
+                            }
+                        })
+                .thenAccept(this::handleResponse)
+                .exceptionally(
+                        e -> {
+                            Throwable cause = e instanceof CompletionException ? e.getCause() : e;
+                            LOG.error("{} Exception with OpenSearch query", logIdprefix, cause);
+                            markQueryReceivedNow();
+                            return null;
+                        });
     }
 
-    @Override
-    public void onResponse(SearchResponse response) {
+    /**
+     * Handles the search response from an asynchronous aggregation query, extracting URLs from term
+     * buckets and adding them to the buffer.
+     *
+     * @param response the search response containing aggregation results
+     */
+    protected void handleResponse(SearchResponse<JsonData> response) {
         long timeTaken = System.currentTimeMillis() - getTimeLastQuerySent();
 
-        Aggregations aggregs = response.getAggregations();
+        Map<String, Aggregate> aggregs = response.aggregations();
 
-        if (aggregs == null) {
+        if (aggregs == null || aggregs.isEmpty()) {
             markQueryReceivedNow();
             return;
         }
 
-        SingleBucketAggregation sample = aggregs.get("sample");
-        if (sample != null) {
-            aggregs = sample.getAggregations();
+        // Unwrap the sample aggregation if present
+        Aggregate sampleAgg = aggregs.get("sample");
+        if (sampleAgg != null) {
+            aggregs = sampleAgg.sampler().aggregations();
         }
 
-        Terms agg = aggregs.get("partition");
+        Aggregate partitionAgg = aggregs.get("partition");
+        List<StringTermsBucket> buckets = partitionAgg.sterms().buckets().array();
 
         int numhits = 0;
         int numBuckets = 0;
@@ -214,35 +270,33 @@ public void onResponse(SearchResponse response) {
         currentBuckets.clear();
 
         // For each entry
-        Iterator<Terms.Bucket> iterator = (Iterator<Terms.Bucket>) agg.getBuckets().iterator();
+        Iterator<StringTermsBucket> iterator = buckets.iterator();
         while (iterator.hasNext()) {
-            Terms.Bucket entry = iterator.next();
-            String key = (String) entry.getKey(); // bucket key
+            StringTermsBucket entry = iterator.next();
+            String key = entry.key(); // bucket key
 
             currentBuckets.add(key);
 
-            long docCount = entry.getDocCount(); // Doc count
+            long docCount = entry.docCount(); // Doc count
 
             int hitsForThisBucket = 0;
 
-            SearchHit lastHit = null;
+            List lastSortValues = null;
 
             // filter results so that we don't include URLs we are already
             // being processed
-            TopHits topHits = entry.getAggregations().get("docs");
-            for (SearchHit hit : topHits.getHits().getHits()) {
+            TopHitsAggregate topHits = entry.aggregations().get("docs").topHits();
+            for (Hit<JsonData> hit : topHits.hits().hits()) {
 
-                LOG.debug(
-                        "{} -> id [{}], _source [{}]",
-                        logIdprefix,
-                        hit.getId(),
-                        hit.getSourceAsString());
+                @SuppressWarnings("unchecked")
+                Map<String, Object> keyValues = (Map<String, Object>) hit.source().to(Object.class);
+
+                LOG.debug("{} -> id [{}], _source [{}]", logIdprefix, hit.id(), keyValues);
 
                 hitsForThisBucket++;
 
-                lastHit = hit;
+                lastSortValues = hit.sort();
 
-                Map<String, Object> keyValues = hit.getSourceAsMap();
                 String url = (String) keyValues.get("url");
 
                 // consider only the first document of the last bucket
@@ -273,8 +327,8 @@ public void onResponse(SearchResponse response) {
                 LOG.debug("{} -> added to buffer : {}", logIdprefix, url);
             }
 
-            if (lastHit != null) {
-                sortValuesForKey(key, lastHit.getSortValues());
+            if (lastSortValues != null && !lastSortValues.isEmpty()) {
+                sortValuesForKey(key, lastSortValues.toArray());
             }
 
             if (hitsForThisBucket > 0) {
diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/HybridSpout.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/HybridSpout.java
index 551153f52..fd600f0af 100644
--- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/HybridSpout.java
+++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/HybridSpout.java
@@ -17,31 +17,28 @@
 
 package org.apache.stormcrawler.opensearch.persistence;
 
-import static org.opensearch.index.query.QueryBuilders.boolQuery;
-
 import com.github.benmanes.caffeine.cache.Cache;
 import com.github.benmanes.caffeine.cache.Caffeine;
+import java.io.IOException;
 import java.time.Instant;
+import java.time.ZoneOffset;
+import java.time.format.DateTimeFormatter;
 import java.util.Date;
 import java.util.List;
 import java.util.Map;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.CompletionException;
 import org.apache.storm.spout.SpoutOutputCollector;
 import org.apache.storm.task.TopologyContext;
 import org.apache.stormcrawler.opensearch.Constants;
 import org.apache.stormcrawler.persistence.EmptyQueueListener;
 import org.apache.stormcrawler.util.ConfUtils;
-import org.joda.time.format.ISODateTimeFormat;
-import org.opensearch.action.search.SearchRequest;
-import org.opensearch.action.search.SearchResponse;
-import org.opensearch.client.RequestOptions;
-import org.opensearch.core.action.ActionListener;
-import org.opensearch.index.query.BoolQueryBuilder;
-import org.opensearch.index.query.QueryBuilders;
-import org.opensearch.search.SearchHit;
-import org.opensearch.search.builder.SearchSourceBuilder;
-import org.opensearch.search.sort.FieldSortBuilder;
-import org.opensearch.search.sort.SortBuilders;
-import org.opensearch.search.sort.SortOrder;
+import org.opensearch.client.json.JsonData;
+import org.opensearch.client.opensearch._types.FieldValue;
+import org.opensearch.client.opensearch._types.SortOrder;
+import org.opensearch.client.opensearch.core.SearchRequest;
+import org.opensearch.client.opensearch.core.SearchResponse;
+import org.opensearch.client.opensearch.core.search.Hit;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -90,7 +87,7 @@ public void emptyQueue(String queueName) {
         // would just overload OpenSearch and yield
         // mainly duplicates
         if (isInQuery.get()) {
-            LOG.trace("{} isInquery true", logIdprefix, queueName);
+            LOG.trace("{} isInquery true for {}", logIdprefix, queueName);
             return;
         }
 
@@ -101,57 +98,88 @@ public void emptyQueue(String queueName) {
             lastTimeResetToNow = Instant.now();
         }
 
-        String formattedQueryDate = ISODateTimeFormat.dateTimeNoMillis().print(queryDate.getTime());
-
-        BoolQueryBuilder queryBuilder =
-                boolQuery()
-                        .filter(QueryBuilders.rangeQuery("nextFetchDate").lte(formattedQueryDate));
-
-        queryBuilder.filter(QueryBuilders.termQuery(partitionField, queueName));
-
-        SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
-        sourceBuilder.query(queryBuilder);
-        sourceBuilder.from(0);
-        sourceBuilder.size(bufferReloadSize);
-        sourceBuilder.explain(false);
-        sourceBuilder.trackTotalHits(false);
+        String formattedQueryDate =
+                Instant.ofEpochMilli(queryDate.getTime())
+                        .atOffset(ZoneOffset.UTC)
+                        .format(DateTimeFormatter.ISO_OFFSET_DATE_TIME);
+
+        SearchRequest.Builder requestBuilder =
+                new SearchRequest.Builder()
+                        .index(indexName)
+                        .size(bufferReloadSize)
+                        .trackTotalHits(t -> t.enabled(false))
+                        .query(
+                                q ->
+                                        q.bool(
+                                                b ->
+                                                        b.filter(
+                                                                        f ->
+                                                                                f.range(
+                                                                                        r ->
+                                                                                                r.field(
+                                                                                                                "nextFetchDate")
+                                                                                                        .lte(
+                                                                                                                JsonData
+                                                                                                                        .of(
+                                                                                                                                formattedQueryDate))))
+                                                                .filter(
+                                                                        f ->
+                                                                                f.term(
+                                                                                        t ->
+                                                                                                t.field(
+                                                                                                                partitionField)
+                                                                                                        .value(
+                                                                                                                FieldValue
+                                                                                                                        .of(
+                                                                                                                                queueName))))));
 
         // sort within a bucket
         for (String bsf : bucketSortField) {
-            FieldSortBuilder sorter = SortBuilders.fieldSort(bsf).order(SortOrder.ASC);
-            sourceBuilder.sort(sorter);
+            requestBuilder.sort(s -> s.field(fs -> fs.field(bsf).order(SortOrder.Asc)));
         }
 
         // do we have a search after for this one?
         Object[] searchAfterValues = searchAfterCache.getIfPresent(queueName);
         if (searchAfterValues != null) {
-            sourceBuilder.searchAfter(searchAfterValues);
+            for (Object sav : searchAfterValues) {
+                requestBuilder.searchAfter(sav.toString());
+            }
         }
 
-        SearchRequest request = new SearchRequest(indexName);
-
-        request.source(sourceBuilder);
-
-        // https://www.elastic.co/guide/en/opensearch/reference/current/search-request-preference.html
-        // _shards:2,3
-        // specific shard but ideally a local copy of it
+        // shard preference for routing
         if (shardID != -1) {
-            request.preference("_shards:" + shardID + "|_local");
+            requestBuilder.preference("_shards:" + shardID + "|_local");
         }
 
-        // dump query to log
-        LOG.debug("{} OpenSearch query {} - {}", logIdprefix, queueName, request.toString());
+        SearchRequest request = requestBuilder.build();
 
-        client.searchAsync(request, RequestOptions.DEFAULT, hrl);
+        // dump query to log
+        LOG.debug("{} OpenSearch query {} - {}", logIdprefix, queueName, request);
+
+        CompletableFuture.supplyAsync(
+                        () -> {
+                            try {
+                                return client.search(request, JsonData.class);
+                            } catch (IOException e) {
+                                throw new CompletionException(e);
+                            }
+                        })
+                .thenAccept(hrl::handleResponse)
+                .exceptionally(
+                        e -> {
+                            Throwable cause = e instanceof CompletionException ? e.getCause() : e;
+                            LOG.error("Exception with OpenSearch query", cause);
+                            return null;
+                        });
     }
 
     /** Overrides the handling of responses for aggregations. */
     @Override
-    public void onResponse(SearchResponse response) {
+    protected void handleResponse(SearchResponse response) {
         // delete all entries from the searchAfterCache when
         // we get the results from the aggregation spouts
         searchAfterCache.invalidateAll();
-        super.onResponse(response);
+        super.handleResponse(response);
     }
 
     /** The aggregation kindly told us where to start from. */
@@ -163,40 +191,55 @@ protected void sortValuesForKey(String key, Object[] sortValues) {
     }
 
     /** Handling of results for a specific queue. */
-    class HostResultListener implements ActionListener {
+    class HostResultListener {
 
-        @Override
-        public void onResponse(SearchResponse response) {
+        /**
+         * Handles the search response for a host-specific query, extracting hits and adding them to
+         * the buffer.
+         *
+         * @param response the search response containing document hits
+         */
+        void handleResponse(SearchResponse response) {
 
             int alreadyprocessed = 0;
             int numDocs = 0;
 
-            SearchHit[] hits = response.getHits().getHits();
+            List> hits = response.hits().hits();
 
             Object[] sortValues = null;
 
             // retrieve the key for these results
             String key = null;
 
-            for (SearchHit hit : hits) {
+            for (Hit hit : hits) {
                 numDocs++;
+
+                @SuppressWarnings("unchecked")
+                Map sourceAsMap =
+                        (Map) hit.source().to(Object.class);
+
                 String pfield = partitionField;
-                Map sourceAsMap = hit.getSourceAsMap();
+                Map fieldSource = sourceAsMap;
                 if (pfield.startsWith("metadata.")) {
-                    sourceAsMap = (Map) sourceAsMap.get("metadata");
+                    @SuppressWarnings("unchecked")
+                    Map metadataMap =
+                            (Map) sourceAsMap.get("metadata");
+                    fieldSource = metadataMap;
                     pfield = pfield.substring(9);
                 }
-                Object key_as_object = sourceAsMap.get(pfield);
+                Object key_as_object = fieldSource.get(pfield);
                 if (key_as_object instanceof List) {
-                    if (((List) (key_as_object)).size() == 1) {
-                        key = ((List) key_as_object).get(0);
+                    @SuppressWarnings("unchecked")
+                    List keyList = (List) key_as_object;
+                    if (keyList.size() == 1) {
+                        key = keyList.get(0);
                     }
                 } else {
                     key = key_as_object.toString();
                 }
 
-                sortValues = hit.getSortValues();
-                if (!addHitToBuffer(hit)) {
+                sortValues = hit.sort().toArray();
+                if (!addHitToBuffer(sourceAsMap)) {
                     alreadyprocessed++;
                 }
             }
@@ -214,14 +257,9 @@ public void onResponse(SearchResponse response) {
                     "{} OpenSearch term query returned {} hits  in {} msec with {} already being processed for {}",
                     logIdprefix,
                     numDocs,
-                    response.getTook().getMillis(),
+                    response.took(),
                     alreadyprocessed,
                     key);
         }
-
-        @Override
-        public void onFailure(Exception e) {
-            LOG.error("Exception with OpenSearch query", e);
-        }
     }
 }
diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java
index bd178f7db..a3f1d1abf 100644
--- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java
+++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java
@@ -22,8 +22,8 @@
 import com.github.benmanes.caffeine.cache.RemovalCause;
 import com.github.benmanes.caffeine.cache.RemovalListener;
 import java.io.IOException;
-import java.util.Arrays;
 import java.util.Date;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.LinkedList;
 import java.util.List;
@@ -41,6 +41,7 @@
 import org.apache.storm.task.TopologyContext;
 import org.apache.storm.tuple.Tuple;
 import org.apache.stormcrawler.Metadata;
+import org.apache.stormcrawler.opensearch.AsyncBulkProcessor;
 import org.apache.stormcrawler.opensearch.BulkItemResponseToFailedFlag;
 import org.apache.stormcrawler.opensearch.Constants;
 import org.apache.stormcrawler.opensearch.IndexCreation;
@@ -52,15 +53,9 @@
 import org.apache.stormcrawler.util.URLPartitioner;
 import org.jetbrains.annotations.NotNull;
 import org.jetbrains.annotations.Nullable;
-import org.opensearch.action.DocWriteRequest;
-import org.opensearch.action.bulk.BulkItemResponse;
-import org.opensearch.action.bulk.BulkProcessor;
-import org.opensearch.action.bulk.BulkRequest;
-import org.opensearch.action.bulk.BulkResponse;
-import org.opensearch.action.index.IndexRequest;
-import org.opensearch.common.xcontent.XContentFactory;
-import org.opensearch.core.rest.RestStatus;
-import org.opensearch.core.xcontent.XContentBuilder;
+import org.opensearch.client.opensearch.core.BulkRequest;
+import org.opensearch.client.opensearch.core.BulkResponse;
+import org.opensearch.client.opensearch.core.bulk.BulkOperation;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -69,7 +64,7 @@
  * 'status' stream. To be used in combination with a Spout to read from the index.
  */
 public class StatusUpdaterBolt extends AbstractStatusUpdaterBolt
-        implements RemovalListener>, BulkProcessor.Listener {
+        implements RemovalListener>, AsyncBulkProcessor.Listener {
 
     private static final Logger LOG = LoggerFactory.getLogger(StatusUpdaterBolt.class);
 
@@ -190,7 +185,7 @@ public void prepare(
         try {
             connection = OpenSearchConnection.getConnection(stormConf, OSBoltType, this);
         } catch (Exception e1) {
-            LOG.error("Can't connect to ElasticSearch", e1);
+            LOG.error("Can't connect to OpenSearch", e1);
             throw new RuntimeException(e1);
         }
 
@@ -244,16 +239,16 @@ public void store(
             return;
         }
 
-        XContentBuilder builder = XContentFactory.jsonBuilder().startObject();
-        builder.field("url", url);
-        builder.field("status", status);
+        Map doc = new HashMap<>();
+        doc.put("url", url);
+        doc.put("status", status.name());
 
-        builder.startObject("metadata");
+        Map metadataMap = new HashMap<>();
         for (String mdKey : metadata.keySet()) {
             String[] values = metadata.getValues(mdKey);
             // periods are not allowed - replace with %2E
             mdKey = mdKey.replaceAll("\\.", "%2E");
-            builder.array(mdKey, values);
+            metadataMap.put(mdKey, List.of(values));
         }
 
         String partitionKey = partitioner.getPartition(url, metadata);
@@ -263,32 +258,51 @@ public void store(
 
         // store routing key in metadata?
         if (StringUtils.isNotBlank(fieldNameForRoutingKey) && routingFieldNameInMetadata) {
-            builder.field(fieldNameForRoutingKey, partitionKey);
+            metadataMap.put(fieldNameForRoutingKey, partitionKey);
         }
 
-        builder.endObject();
+        doc.put("metadata", metadataMap);
 
         // store routing key outside metadata?
         if (StringUtils.isNotBlank(fieldNameForRoutingKey) && !routingFieldNameInMetadata) {
-            builder.field(fieldNameForRoutingKey, partitionKey);
+            doc.put(fieldNameForRoutingKey, partitionKey);
         }
 
         if (nextFetch.isPresent()) {
-            builder.timeField("nextFetchDate", nextFetch.get());
+            doc.put("nextFetchDate", nextFetch.get().toInstant().toString());
         }
-
-        builder.endObject();
-
-        IndexRequest request = new IndexRequest(getIndexName(metadata));
-
         // check that we don't overwrite an existing entry
         // When create is used, the index operation will fail if a document
         // by that id already exists in the index.
         final boolean create = status.equals(Status.DISCOVERED);
-        request.source(builder).id(documentID).create(create);
-
-        if (doRouting) {
-            request.routing(partitionKey);
+        final String targetIndex = getIndexName(metadata);
+        final String routing = doRouting ? partitionKey : null;
+
+        BulkOperation op;
+        if (create) {
+            op =
+                    BulkOperation.of(
+                            b ->
+                                    b.create(
+                                            c -> {
+                                                c.index(targetIndex).id(documentID).document(doc);
+                                                if (routing != null) {
+                                                    c.routing(routing);
+                                                }
+                                                return c;
+                                            }));
+        } else {
+            op =
+                    BulkOperation.of(
+                            b ->
+                                    b.index(
+                                            idx -> {
+                                                idx.index(targetIndex).id(documentID).document(doc);
+                                                if (routing != null) {
+                                                    idx.routing(routing);
+                                                }
+                                                return idx;
+                                            }));
         }
 
         waitAckLock.lock();
@@ -302,7 +316,7 @@ public void store(
 
         LOG.debug("Sending to OpenSearch buffer {} with ID {}", url, documentID);
 
-        connection.addToProcessor(request);
+        connection.addToProcessor(op);
     }
 
     @Override
@@ -320,26 +334,31 @@ public void onRemoval(
 
     @Override
     public void afterBulk(long executionId, BulkRequest request, BulkResponse response) {
-        LOG.debug("afterBulk [{}] with {} responses", executionId, request.numberOfActions());
+        LOG.debug("afterBulk [{}] with {} responses", executionId, request.operations().size());
         eventCounter.scope("bulks_received").incrBy(1);
-        eventCounter.scope("bulk_msec").incrBy(response.getTook().getMillis());
-        eventCounter.scope("received").incrBy(request.numberOfActions());
-        receivedPerSecMetrics.scope("received").update(request.numberOfActions());
+        eventCounter.scope("bulk_msec").incrBy(response.took());
+        eventCounter.scope("received").incrBy(request.operations().size());
+        receivedPerSecMetrics.scope("received").update(request.operations().size());
 
         var idsToBulkItemsWithFailedFlag =
-                Arrays.stream(response.getItems())
+                response.items().stream()
                         .map(
                                 bir -> {
-                                    String id = bir.getId();
-                                    BulkItemResponse.Failure f = bir.getFailure();
+                                    String id = bir.id();
+                                    var error = bir.error();
                                     boolean failed = false;
-                                    if (f != null) {
+                                    if (error != null) {
                                         // already discovered
-                                        if (f.getStatus().equals(RestStatus.CONFLICT)) {
+                                        if (bir.status() == 409) {
                                             eventCounter.scope("doc_conflicts").incrBy(1);
                                             LOG.debug("Doc conflict ID {}", id);
                                         } else {
-                                            LOG.error("Update ID {}, failure: {}", id, f);
+                                            LOG.error(
+                                                    "Update ID {}, failure: {}",
+                                                    id,
+                                                    error.reason() != null
+                                                            ? error.reason()
+                                                            : "unknown");
                                             failed = true;
                                         }
                                     }
@@ -440,13 +459,14 @@ public void afterBulk(long executionId, BulkRequest request, BulkResponse respon
     @Override
     public void afterBulk(long executionId, BulkRequest request, Throwable throwable) {
         eventCounter.scope("bulks_received").incrBy(1);
-        eventCounter.scope("received").incrBy(request.numberOfActions());
-        receivedPerSecMetrics.scope("received").update(request.numberOfActions());
+        eventCounter.scope("received").incrBy(request.operations().size());
+        receivedPerSecMetrics.scope("received").update(request.operations().size());
         LOG.error("Exception with bulk {} - failing the whole lot ", executionId, throwable);
 
         final var failedIds =
-                request.requests().stream()
-                        .map(DocWriteRequest::id)
+                request.operations().stream()
+                        .map(OpenSearchConnection::getBulkOperationId)
+                        .filter(Objects::nonNull)
                         .collect(Collectors.toUnmodifiableSet());
         Map> failedTupleLists;
         waitAckLock.lock();
@@ -476,7 +496,7 @@ public void afterBulk(long executionId, BulkRequest request, Throwable throwable
 
     @Override
     public void beforeBulk(long executionId, BulkRequest request) {
-        LOG.debug("beforeBulk {} with {} actions", executionId, request.numberOfActions());
+        LOG.debug("beforeBulk {} with {} actions", executionId, request.operations().size());
         eventCounter.scope("bulks_sent").incrBy(1);
     }
 
diff --git a/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/AsyncBulkProcessorTest.java b/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/AsyncBulkProcessorTest.java
new file mode 100644
index 000000000..2fff8e152
--- /dev/null
+++ b/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/AsyncBulkProcessorTest.java
@@ -0,0 +1,241 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.stormcrawler.opensearch;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.Timeout;
+import org.opensearch.client.opensearch.OpenSearchClient;
+import org.opensearch.client.opensearch.core.BulkRequest;
+import org.opensearch.client.opensearch.core.BulkResponse;
+import org.opensearch.client.opensearch.core.bulk.BulkOperation;
+
+class AsyncBulkProcessorTest {
+
+    private static BulkOperation dummyOp() {
+        return BulkOperation.of(b -> b.delete(d -> d.index("idx").id("1")));
+    }
+
+    private static BulkResponse emptyBulkResponse() {
+        return new BulkResponse.Builder()
+                .errors(false)
+                .items(Collections.emptyList())
+                .took(1)
+                .build();
+    }
+
+    private static OpenSearchClient mockClient() throws IOException {
+        OpenSearchClient client = mock(OpenSearchClient.class);
+        when(client.bulk(any(BulkRequest.class))).thenReturn(emptyBulkResponse());
+        return client;
+    }
+
+    /** Verify that a flush is triggered when the bulkActions threshold is reached. */
+    @Test
+    @Timeout(10)
+    void flushAtBulkActionsThreshold() throws Exception {
+        CountDownLatch afterBulkLatch = new CountDownLatch(1);
+        AtomicInteger afterBulkCount = new AtomicInteger(0);
+
+        AsyncBulkProcessor.Listener listener =
+                new AsyncBulkProcessor.Listener() {
+                    @Override
+                    public void beforeBulk(long executionId, BulkRequest request) {}
+
+                    @Override
+                    public void afterBulk(
+                            long executionId, BulkRequest request, BulkResponse response) {
+                        afterBulkCount.incrementAndGet();
+                        afterBulkLatch.countDown();
+                    }
+
+                    @Override
+                    public void afterBulk(
+                            long executionId, BulkRequest request, Throwable failure) {}
+                };
+
+        OpenSearchClient client = mockClient();
+
+        // bulkActions = 3, long flush interval so only threshold triggers
+        AsyncBulkProcessor processor =
+                new AsyncBulkProcessor.Builder(client, listener)
+                        .setBulkActions(3)
+                        .setFlushIntervalMillis(60_000)
+                        .setConcurrentRequests(1)
+                        .build();
+
+        processor.add(dummyOp());
+        processor.add(dummyOp());
+        // third add should trigger flush
+        processor.add(dummyOp());
+
+        assertTrue(afterBulkLatch.await(5, TimeUnit.SECONDS), "afterBulk should have been called");
+        assertEquals(1, afterBulkCount.get());
+
+        processor.awaitClose(5, TimeUnit.SECONDS);
+    }
+
+    /** Verify that the timer-based flush fires even when bulkActions threshold is not reached. */
+    @Test
+    @Timeout(10)
+    void timerBasedFlush() throws Exception {
+        CountDownLatch afterBulkLatch = new CountDownLatch(1);
+
+        AsyncBulkProcessor.Listener listener =
+                new AsyncBulkProcessor.Listener() {
+                    @Override
+                    public void beforeBulk(long executionId, BulkRequest request) {}
+
+                    @Override
+                    public void afterBulk(
+                            long executionId, BulkRequest request, BulkResponse response) {
+                        afterBulkLatch.countDown();
+                    }
+
+                    @Override
+                    public void afterBulk(
+                            long executionId, BulkRequest request, Throwable failure) {}
+                };
+
+        OpenSearchClient client = mockClient();
+
+        // bulkActions very high, short flush interval
+        AsyncBulkProcessor processor =
+                new AsyncBulkProcessor.Builder(client, listener)
+                        .setBulkActions(1000)
+                        .setFlushIntervalMillis(200)
+                        .setConcurrentRequests(1)
+                        .build();
+
+        processor.add(dummyOp());
+
+        // should be flushed by timer within ~200ms
+        assertTrue(
+                afterBulkLatch.await(5, TimeUnit.SECONDS),
+                "timer-based flush should have triggered");
+
+        processor.awaitClose(5, TimeUnit.SECONDS);
+    }
+
+    /** Verify that concurrent requests are limited by the semaphore. */
+    @Test
+    @Timeout(10)
+    void concurrentRequestLimiting() throws Exception {
+        AtomicInteger concurrentCalls = new AtomicInteger(0);
+        AtomicInteger maxConcurrent = new AtomicInteger(0);
+        CountDownLatch allDone = new CountDownLatch(3);
+
+        OpenSearchClient client = mock(OpenSearchClient.class);
+        when(client.bulk(any(BulkRequest.class)))
+                .thenAnswer(
+                        invocation -> {
+                            int current = concurrentCalls.incrementAndGet();
+                            maxConcurrent.updateAndGet(prev -> Math.max(prev, current));
+                            // simulate some work
+                            Thread.sleep(200);
+                            concurrentCalls.decrementAndGet();
+                            allDone.countDown();
+                            return emptyBulkResponse();
+                        });
+
+        AsyncBulkProcessor.Listener listener =
+                new AsyncBulkProcessor.Listener() {
+                    @Override
+                    public void beforeBulk(long executionId, BulkRequest request) {}
+
+                    @Override
+                    public void afterBulk(
+                            long executionId, BulkRequest request, BulkResponse response) {}
+
+                    @Override
+                    public void afterBulk(
+                            long executionId, BulkRequest request, Throwable failure) {}
+                };
+
+        // concurrentRequests = 1 means at most 1 in-flight request
+        AsyncBulkProcessor processor =
+                new AsyncBulkProcessor.Builder(client, listener)
+                        .setBulkActions(1)
+                        .setFlushIntervalMillis(60_000)
+                        .setConcurrentRequests(1)
+                        .build();
+
+        // add 3 operations (each triggers flush since bulkActions=1)
+        processor.add(dummyOp());
+        processor.add(dummyOp());
+        processor.add(dummyOp());
+
+        assertTrue(allDone.await(5, TimeUnit.SECONDS));
+        // with concurrentRequests=1, at most 1 bulk call should execute concurrently
+        assertEquals(1, maxConcurrent.get());
+
+        processor.awaitClose(5, TimeUnit.SECONDS);
+    }
+
+    /** Verify that awaitClose drains remaining buffered operations before returning. */
+    @Test
+    @Timeout(10)
+    void awaitCloseDrainsPending() throws Exception {
+        AtomicInteger totalBulkCalls = new AtomicInteger(0);
+
+        AsyncBulkProcessor.Listener listener =
+                new AsyncBulkProcessor.Listener() {
+                    @Override
+                    public void beforeBulk(long executionId, BulkRequest request) {}
+
+                    @Override
+                    public void afterBulk(
+                            long executionId, BulkRequest request, BulkResponse response) {
+                        totalBulkCalls.incrementAndGet();
+                    }
+
+                    @Override
+                    public void afterBulk(
+                            long executionId, BulkRequest request, Throwable failure) {}
+                };
+
+        OpenSearchClient client = mockClient();
+
+        // bulkActions very high so nothing auto-flushes, long interval
+        AsyncBulkProcessor processor =
+                new AsyncBulkProcessor.Builder(client, listener)
+                        .setBulkActions(1000)
+                        .setFlushIntervalMillis(60_000)
+                        .setConcurrentRequests(1)
+                        .build();
+
+        // add some operations that won't auto-flush
+        processor.add(dummyOp());
+        processor.add(dummyOp());
+
+        // awaitClose should drain the buffer
+        boolean closed = processor.awaitClose(5, TimeUnit.SECONDS);
+        assertTrue(closed, "awaitClose should return true");
+        assertEquals(1, totalBulkCalls.get(), "buffered operations should have been flushed");
+    }
+}
diff --git a/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/OpenSearchConnectionTest.java b/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/OpenSearchConnectionTest.java
new file mode 100644
index 000000000..8c5789a1c
--- /dev/null
+++ b/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/OpenSearchConnectionTest.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.stormcrawler.opensearch;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import org.junit.jupiter.api.Test;
+
+class OpenSearchConnectionTest {
+
+    @Test
+    void parseSeconds() {
+        assertEquals(5000, OpenSearchConnection.parseTimeValueToMillis("5s", 0));
+    }
+
+    @Test
+    void parseMilliseconds() {
+        assertEquals(500, OpenSearchConnection.parseTimeValueToMillis("500ms", 0));
+    }
+
+    @Test
+    void parseMinutes() {
+        assertEquals(120000, OpenSearchConnection.parseTimeValueToMillis("2m", 0));
+    }
+
+    @Test
+    void parsePlainNumber() {
+        assertEquals(42, OpenSearchConnection.parseTimeValueToMillis("42", 0));
+    }
+
+    @Test
+    void nullReturnsDefault() {
+        assertEquals(5000, OpenSearchConnection.parseTimeValueToMillis(null, 5000));
+    }
+
+    @Test
+    void emptyReturnsDefault() {
+        assertEquals(5000, OpenSearchConnection.parseTimeValueToMillis("", 5000));
+    }
+
+    @Test
+    void blankReturnsDefault() {
+        assertEquals(5000, OpenSearchConnection.parseTimeValueToMillis("   ", 5000));
+    }
+
+    @Test
+    void invalidReturnsDefault() {
+        assertEquals(3000, OpenSearchConnection.parseTimeValueToMillis("abc", 3000));
+    }
+
+    @Test
+    void invalidWithSuffixReturnsDefault() {
+        assertEquals(3000, OpenSearchConnection.parseTimeValueToMillis("abcs", 3000));
+    }
+
+    @Test
+    void whitespaceIsTrimmed() {
+        assertEquals(5000, OpenSearchConnection.parseTimeValueToMillis("  5s  ", 0));
+    }
+}
diff --git a/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/bolt/AbstractOpenSearchTest.java b/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/bolt/AbstractOpenSearchTest.java
index e9c72b336..44150ea67 100644
--- a/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/bolt/AbstractOpenSearchTest.java
+++ b/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/bolt/AbstractOpenSearchTest.java
@@ -26,7 +26,8 @@
 @Testcontainers(disabledWithoutDocker = true)
 public abstract class AbstractOpenSearchTest {
 
-    private static final String OPENSEARCH_VERSION = "2.19.4";
+    private static final String OPENSEARCH_VERSION =
+            System.getProperty("opensearch-version", "2.19.5");
 
     public static final String PASSWORD = "This1sAPassw0rd";
 
diff --git a/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/bolt/StatusBoltTest.java b/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/bolt/StatusBoltTest.java
index f8440835d..414d1b984 100644
--- a/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/bolt/StatusBoltTest.java
+++ b/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/bolt/StatusBoltTest.java
@@ -46,12 +46,11 @@
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.Timeout;
-import org.opensearch.action.get.GetRequest;
-import org.opensearch.action.get.GetResponse;
-import org.opensearch.client.RequestOptions;
 import org.opensearch.client.RestClient;
-import org.opensearch.client.RestClientBuilder;
-import org.opensearch.client.RestHighLevelClient;
+import org.opensearch.client.json.jackson.JacksonJsonpMapper;
+import org.opensearch.client.opensearch.OpenSearchClient;
+import org.opensearch.client.opensearch.core.GetResponse;
+import org.opensearch.client.transport.rest_client.RestClientTransport;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -61,7 +60,9 @@ class StatusBoltTest extends AbstractOpenSearchTest {
 
     protected TestOutputCollector output;
 
-    protected org.opensearch.client.RestHighLevelClient client;
+    protected OpenSearchClient client;
+
+    private RestClient restClient;
 
     private static final Logger LOG = LoggerFactory.getLogger(StatusBoltTest.class);
 
@@ -81,12 +82,15 @@ static void afterClass() {
     @BeforeEach
     void setupStatusBolt() throws IOException {
         bolt = new StatusUpdaterBolt();
-        RestClientBuilder builder =
+        restClient =
                 RestClient.builder(
-                        new HttpHost(
-                                opensearchContainer.getHost(),
-                                opensearchContainer.getMappedPort(9200)));
-        client = new RestHighLevelClient(builder);
+                                new HttpHost(
+                                        opensearchContainer.getHost(),
+                                        opensearchContainer.getMappedPort(9200)))
+                        .build();
+        RestClientTransport transport =
+                new RestClientTransport(restClient, new JacksonJsonpMapper());
+        client = new OpenSearchClient(transport);
         // configure the status updater bolt
         Map conf = new HashMap<>();
         conf.put("opensearch.status.routing.fieldname", "metadata.key");
@@ -107,7 +111,7 @@ void close() {
         bolt.cleanup();
         output = null;
         try {
-            client.close();
+            restClient.close();
         } catch (IOException e) {
         }
     }
@@ -129,6 +133,7 @@ private Future store(String url, Status status, Metadata metadata) {
     @Test
     @Timeout(value = 2, unit = TimeUnit.MINUTES)
     // see https://github.com/apache/stormcrawler/issues/885
+    @SuppressWarnings("unchecked")
     void checkListKeyFromOpensearch()
             throws IOException, ExecutionException, InterruptedException, TimeoutException {
         String url = "https://www.url.net/something";
@@ -136,10 +141,10 @@ void checkListKeyFromOpensearch()
         md.addValue("someKey", "someValue");
         store(url, Status.DISCOVERED, md).get(10, TimeUnit.SECONDS);
         assertEquals(1, output.getAckedTuples().size());
-        // check output in Opensearch?
+        // check output in Opensearch
         String id = org.apache.commons.codec.digest.DigestUtils.sha256Hex(url);
-        GetResponse result = client.get(new GetRequest("status", id), RequestOptions.DEFAULT);
-        Map sourceAsMap = result.getSourceAsMap();
+        GetResponse result = client.get(g -> g.index("status").id(id), Map.class);
+        Map sourceAsMap = result.source();
         final String pfield = "metadata.somekey";
         sourceAsMap = (Map) sourceAsMap.get("metadata");
         final var pfieldNew = pfield.substring(9);
diff --git a/pom.xml b/pom.xml
index ed9c61de5..0e0d7daa9 100644
--- a/pom.xml
+++ b/pom.xml
@@ -559,6 +559,7 @@ under the License.
                                 CONTRIBUTING.md
                                 RELEASING.md
                                 external/opensearch/dashboards/**
+                                external/opensearch-java/dashboards/**
                                 external/solr/archetype/src/main/resources/archetype-resources/configsets/**
                                 THIRD-PARTY.properties
                                 THIRD-PARTY.txt

From d530818a51cfdbe4df5e9e8d08a4a83d945b472b Mon Sep 17 00:00:00 2001
From: Davide Polato 
Date: Fri, 10 Apr 2026 17:28:42 +0200
Subject: [PATCH 3/4] feat: Refine new opensearch-java module (#1515)

This commit aligns the opensearch-java module with recent legacy updates,
completes the migration to HC5/API 3.x, and cleans up duplicated resources.

Refactors and Alignment:
- Ported DelegateRefresher for dynamic config reloading (#1870).
- Adopted Storm V2 metrics bridge via CrawlerMetrics (#1846).
- Aligned log messages and metric scopes to OpenSearch (#1871).
- Ported WaitAckCache extraction to centralize bulk-ack logic (#1869).
- Fixed a race condition in IndexerBolt by inverting the execution order,
  ensuring tuples are registered in waitAck before bulk dispatch.
- Refactored BulkItemResponseToFailedFlag to a Java record with a compact
  constructor for strict null-safety.

Maintenance and Cleanup:
- Removed duplicated archetype, dashboards, and opensearch-conf.yaml
  to prevent maintenance overhead.
- Updated README with a migration guide pointing to legacy resources.
- Removed dead rat-exclude in root pom.xml.
---
 THIRD-PARTY.txt                               |  14 +-
 external/opensearch-java/README.md            |  89 ++---
 external/opensearch-java/archetype/pom.xml    |  72 ----
 .../META-INF/archetype-post-generate.groovy   |  21 --
 .../META-INF/maven/archetype-metadata.xml     |  72 ----
 .../archetype-resources/OS_IndexInit.sh       |  40 ---
 .../resources/archetype-resources/README.md   |  80 -----
 .../archetype-resources/crawler-conf.yaml     | 160 ---------
 .../archetype-resources/crawler.flux          | 141 --------
 .../dashboards/importDashboards.sh            |  29 --
 .../dashboards/metrics.ndjson                 |  10 -
 .../dashboards/status.ndjson                  |   5 -
 .../dashboards/storm.ndjson                   |   5 -
 .../archetype-resources/docker-compose.yml    |  81 -----
 .../archetype-resources/injection.flux        |  50 ---
 .../archetype-resources/opensearch-conf.yaml  | 115 ------
 .../resources/archetype-resources/pom.xml     | 149 --------
 .../main/resources/default-regex-filters.txt  |  32 --
 .../resources/default-regex-normalizers.xml   |  78 ----
 .../src/main/resources/indexer.mapping        |  40 ---
 .../src/main/resources/jsoupfilters.json      |  27 --
 .../src/main/resources/metrics.mapping        |  40 ---
 .../src/main/resources/parsefilters.json      |  23 --
 .../src/main/resources/status.mapping         |  39 --
 .../src/main/resources/urlfilters.json        |  60 ----
 .../dashboards/importDashboards.sh            |  29 --
 .../opensearch-java/dashboards/metrics.ndjson |  10 -
 .../opensearch-java/dashboards/status.ndjson  |   5 -
 .../opensearch-java/dashboards/storm.ndjson   |   5 -
 external/opensearch-java/opensearch-conf.yaml | 128 -------
 external/opensearch-java/pom.xml              |  16 +-
 .../BulkItemResponseToFailedFlag.java         |  72 ++--
 .../opensearch/DelegateRefresher.java         | 181 ++++++++++
 .../opensearch/IndexCreation.java             |  44 +--
 .../opensearch/OpenSearchConnection.java      | 262 ++++++--------
 .../stormcrawler/opensearch/WaitAckCache.java | 340 ++++++++++++++++++
 .../opensearch/bolt/DeletionBolt.java         | 200 ++---------
 .../opensearch/bolt/IndexerBolt.java          | 233 ++----------
 .../filtering/JSONURLFilterWrapper.java       | 123 +------
 .../opensearch/metrics/MetricsReporter.java   | 219 +++++++++++
 .../opensearch/metrics/StatusMetricsBolt.java |   9 +-
 .../parse/filter/JSONResourceWrapper.java     | 123 +------
 .../opensearch/persistence/AbstractSpout.java |   2 +-
 .../persistence/AggregationSpout.java         |  14 +-
 .../opensearch/persistence/HybridSpout.java   |   2 +-
 .../persistence/StatusUpdaterBolt.java        | 255 +++----------
 .../opensearch/DelegateRefresherTest.java     | 256 +++++++++++++
 .../opensearch/WaitAckCacheTest.java          | 336 +++++++++++++++++
 .../opensearch/bolt/StatusBoltTest.java       |  18 +-
 .../metrics/MetricsReporterTest.java          |  54 +++
 pom.xml                                       |   2 -
 51 files changed, 1721 insertions(+), 2689 deletions(-)
 delete mode 100644 external/opensearch-java/archetype/pom.xml
 delete mode 100644 external/opensearch-java/archetype/src/main/resources/META-INF/archetype-post-generate.groovy
 delete mode 100644 external/opensearch-java/archetype/src/main/resources/META-INF/maven/archetype-metadata.xml
 delete mode 100755 external/opensearch-java/archetype/src/main/resources/archetype-resources/OS_IndexInit.sh
 delete mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/README.md
 delete mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/crawler-conf.yaml
 delete mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/crawler.flux
 delete mode 100755 external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/importDashboards.sh
 delete mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/metrics.ndjson
 delete mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/status.ndjson
 delete mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/storm.ndjson
 delete mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/docker-compose.yml
 delete mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/injection.flux
 delete mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/opensearch-conf.yaml
 delete mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/pom.xml
 delete mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-filters.txt
 delete mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-normalizers.xml
 delete mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/indexer.mapping
 delete mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/jsoupfilters.json
 delete mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/metrics.mapping
 delete mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/parsefilters.json
 delete mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/status.mapping
 delete mode 100644 external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/urlfilters.json
 delete mode 100755 external/opensearch-java/dashboards/importDashboards.sh
 delete mode 100644 external/opensearch-java/dashboards/metrics.ndjson
 delete mode 100644 external/opensearch-java/dashboards/status.ndjson
 delete mode 100644 external/opensearch-java/dashboards/storm.ndjson
 delete mode 100644 external/opensearch-java/opensearch-conf.yaml
 create mode 100644 external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/DelegateRefresher.java
 create mode 100644 external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/WaitAckCache.java
 create mode 100644 external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/metrics/MetricsReporter.java
 create mode 100644 external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/DelegateRefresherTest.java
 create mode 100644 external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/WaitAckCacheTest.java
 create mode 100644 external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/metrics/MetricsReporterTest.java

diff --git a/THIRD-PARTY.txt b/THIRD-PARTY.txt
index 3171d8044..37b3c080a 100644
--- a/THIRD-PARTY.txt
+++ b/THIRD-PARTY.txt
@@ -31,7 +31,6 @@ List of third-party dependencies grouped by their license type.
         * Apache Commons IO (commons-io:commons-io:2.21.0 - https://commons.apache.org/proper/commons-io/)
         * Apache Commons Lang (org.apache.commons:commons-lang3:3.20.0 - https://commons.apache.org/proper/commons-lang/)
         * Apache Commons Logging (commons-logging:commons-logging:1.2 - http://commons.apache.org/proper/commons-logging/)
-        * Apache Commons Logging (commons-logging:commons-logging:1.3.3 - https://commons.apache.org/proper/commons-logging/)
         * Apache Commons Logging (commons-logging:commons-logging:1.3.6 - https://commons.apache.org/proper/commons-logging/)
         * Apache Commons Math (org.apache.commons:commons-math3:3.6.1 - http://commons.apache.org/proper/commons-math/)
         * Apache FontBox (org.apache.pdfbox:fontbox:3.0.7 - http://pdfbox.apache.org/)
@@ -53,10 +52,9 @@ List of third-party dependencies grouped by their license type.
         * Apache HBase Unsafe Wrapper (org.apache.hbase.thirdparty:hbase-unsafe:4.1.12 - https://hbase.apache.org/hbase-unsafe)
         * Apache HttpAsyncClient (org.apache.httpcomponents:httpasyncclient:4.1.5 - http://hc.apache.org/httpcomponents-asyncclient)
         * Apache HttpClient (org.apache.httpcomponents:httpclient:4.5.14 - http://hc.apache.org/httpcomponents-client-ga)
-        * Apache HttpClient (org.apache.httpcomponents.client5:httpclient5:5.3.1 - https://hc.apache.org/httpcomponents-client-5.0.x/5.3.1/httpclient5/)
-        * Apache HttpClient Mime (org.apache.httpcomponents:httpmime:4.5.14 - http://hc.apache.org/httpcomponents-client-ga)
-        * Apache HttpComponents Core HTTP/1.1 (org.apache.httpcomponents.core5:httpcore5:5.2.5 - https://hc.apache.org/httpcomponents-core-5.2.x/5.2.5/httpcore5/)
-        * Apache HttpComponents Core HTTP/2 (org.apache.httpcomponents.core5:httpcore5-h2:5.2.5 - https://hc.apache.org/httpcomponents-core-5.2.x/5.2.5/httpcore5-h2/)
+        * Apache HttpClient (org.apache.httpcomponents.client5:httpclient5:5.6 - https://hc.apache.org/httpcomponents-client-5.5.x/5.6/httpclient5/)
+        * Apache HttpComponents Core HTTP/1.1 (org.apache.httpcomponents.core5:httpcore5:5.4.2 - https://hc.apache.org/httpcomponents-core-5.4.x/5.4.2/httpcore5/)
+        * Apache HttpComponents Core HTTP/2 (org.apache.httpcomponents.core5:httpcore5-h2:5.4.2 - https://hc.apache.org/httpcomponents-core-5.4.x/5.4.2/httpcore5-h2/)
         * Apache HttpCore (org.apache.httpcomponents:httpcore:4.4.16 - http://hc.apache.org/httpcomponents-core-ga)
         * Apache HttpCore NIO (org.apache.httpcomponents:httpcore-nio:4.4.16 - http://hc.apache.org/httpcomponents-core-ga)
         * Apache James :: Mime4j :: Core (org.apache.james:apache-mime4j-core:0.8.13 - http://james.apache.org/mime4j/apache-mime4j-core)
@@ -221,7 +219,7 @@ List of third-party dependencies grouped by their license type.
         * opensearch-compress (org.opensearch:opensearch-compress:2.19.5 - https://github.com/opensearch-project/OpenSearch.git)
         * opensearch-core (org.opensearch:opensearch-core:2.19.5 - https://github.com/opensearch-project/OpenSearch.git)
         * opensearch-geo (org.opensearch:opensearch-geo:2.19.5 - https://github.com/opensearch-project/OpenSearch.git)
-        * OpenSearch Java Client (org.opensearch.client:opensearch-java:2.13.0 - https://github.com/opensearch-project/opensearch-java/)
+        * OpenSearch Java Client (org.opensearch.client:opensearch-java:3.8.0 - https://github.com/opensearch-project/opensearch-java/)
         * opensearch-secure-sm (org.opensearch:opensearch-secure-sm:2.19.5 - https://github.com/opensearch-project/OpenSearch.git)
         * opensearch-task-commons (org.opensearch:opensearch-task-commons:2.19.5 - https://github.com/opensearch-project/OpenSearch.git)
         * opensearch-telemetry (org.opensearch:opensearch-telemetry:2.19.5 - https://github.com/opensearch-project/OpenSearch.git)
@@ -258,7 +256,6 @@ List of third-party dependencies grouped by their license type.
         * Playwright - Main Library (com.microsoft.playwright:playwright:1.58.0 - https://github.com/microsoft/playwright-java/playwright)
         * proto-google-common-protos (com.google.api.grpc:proto-google-common-protos:2.59.2 - https://github.com/googleapis/sdk-platform-java)
         * rank-eval (org.opensearch.plugin:rank-eval-client:2.19.5 - https://github.com/opensearch-project/OpenSearch.git)
-        * rest (org.opensearch.client:opensearch-rest-client:2.12.0 - https://github.com/opensearch-project/OpenSearch.git)
         * rest (org.opensearch.client:opensearch-rest-client:2.19.5 - https://github.com/opensearch-project/OpenSearch.git)
         * rest-high-level (org.opensearch.client:opensearch-rest-high-level-client:2.19.5 - https://github.com/opensearch-project/OpenSearch.git)
         * rome (com.rometools:rome:2.1.0 - http://rometools.com/rome)
@@ -267,7 +264,6 @@ List of third-party dependencies grouped by their license type.
         * Shaded Deps for Storm Client (org.apache.storm:storm-shaded-deps:2.8.5 - https://storm.apache.org/storm-shaded-deps)
         * SnakeYAML (org.yaml:snakeyaml:2.6 - https://bitbucket.org/snakeyaml/snakeyaml)
         * snappy-java (org.xerial.snappy:snappy-java:1.1.10.4 - https://github.com/xerial/snappy-java)
-        * sniffer (org.opensearch.client:opensearch-rest-client-sniffer:2.12.0 - https://github.com/opensearch-project/OpenSearch.git)
         * sniffer (org.opensearch.client:opensearch-rest-client-sniffer:2.19.5 - https://github.com/opensearch-project/OpenSearch.git)
         * SparseBitSet (com.zaxxer:SparseBitSet:1.3 - https://github.com/brettwooldridge/SparseBitSet)
         * storm-autocreds (org.apache.storm:storm-autocreds:2.8.5 - https://storm.apache.org/external/storm-autocreds)
@@ -370,7 +366,7 @@ List of third-party dependencies grouped by their license type.
 
     Eclipse Public License 2.0, GNU General Public License, version 2 with the GNU Classpath Exception
 
-        * Eclipse Parsson (org.eclipse.parsson:parsson:1.1.6 - https://github.com/eclipse-ee4j/parsson/parsson)
+        * Eclipse Parsson (org.eclipse.parsson:parsson:1.1.7 - https://github.com/eclipse-ee4j/parsson/parsson)
         * Jakarta JSON Processing API (jakarta.json:jakarta.json-api:2.1.3 - https://github.com/eclipse-ee4j/jsonp)
         * JSON-B API (jakarta.json.bind:jakarta.json.bind-api:2.0.0 - https://eclipse-ee4j.github.io/jsonb-api)
         * JSON-P Default Provider (org.glassfish:jakarta.json:2.0.0 - https://github.com/eclipse-ee4j/jsonp)
diff --git a/external/opensearch-java/README.md b/external/opensearch-java/README.md
index 159bb29b6..080eef36d 100644
--- a/external/opensearch-java/README.md
+++ b/external/opensearch-java/README.md
@@ -1,70 +1,47 @@
-stormcrawler-opensearch
+stormcrawler-opensearch-java
 ===========================
 
-A collection of resources for [OpenSearch](https://opensearch.org/):
-* [IndexerBolt](https://github.com/apache/stormcrawler/blob/master/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/bolt/IndexerBolt.java) for indexing documents crawled with StormCrawler
-* [Spouts](https://github.com/apache/stormcrawler/blob/master/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/persistence/AggregationSpout.java) and [StatusUpdaterBolt](https://github.com/apache/stormcrawler/blob/master/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java) for persisting URL information in recursive crawls
-* [MetricsConsumer](https://github.com/apache/stormcrawler/blob/master/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/metrics/MetricsConsumer.java)
-* [StatusMetricsBolt](https://github.com/apache/stormcrawler/blob/master/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/metrics/StatusMetricsBolt.java) for sending the breakdown of URLs per status as metrics and display its evolution over time.
+A collection of resources for [OpenSearch](https://opensearch.org/) built on the
+[OpenSearch Java Client 3.x](https://opensearch.org/docs/latest/clients/java/) and
+Apache HttpClient 5:
 
-as well as resources for building basic real-time monitoring dashboards for the crawls, see below.
+* [IndexerBolt](https://github.com/apache/stormcrawler/blob/master/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/bolt/IndexerBolt.java) for indexing documents crawled with StormCrawler
+* [Spouts](https://github.com/apache/stormcrawler/blob/master/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/AggregationSpout.java) and [StatusUpdaterBolt](https://github.com/apache/stormcrawler/blob/master/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java) for persisting URL information in recursive crawls
+* [MetricsConsumer](https://github.com/apache/stormcrawler/blob/master/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/metrics/MetricsConsumer.java)
+* [StatusMetricsBolt](https://github.com/apache/stormcrawler/blob/master/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/metrics/StatusMetricsBolt.java) for sending the breakdown of URLs per status as metrics and displaying its evolution over time.
 
-This module is ported from the Elasticsearch one.
+This module is functionally equivalent to the legacy `external/opensearch` module
+(which is based on the deprecated `RestHighLevelClient` and HttpClient 4), but
+uses the typed `OpenSearchClient` and the `ApacheHttpClient5TransportBuilder`
+transport. Unlike the legacy client, the Java Client 3.x ships neither a
+sniffer nor a built-in `BulkProcessor`; this module provides an internal
+`AsyncBulkProcessor` that preserves the same semantics (size/count/time based
+flushing, back-pressure, listener callbacks).
 
 Getting started
 ---------------------
 
-The easiest way is currently to use the archetype for OpenSearch with:
-
-`mvn archetype:generate -DarchetypeGroupId=org.apache.stormcrawler -DarchetypeArtifactId=stormcrawler-opensearch-archetype -DarchetypeVersion=3.4.0`
-
-You'll be asked to enter a groupId (e.g. com.mycompany.crawler), an artefactId (e.g. stormcrawler), a version, a package name and details about the user agent to use.
-
-This will not only create a fully formed project containing a POM with the dependency above but also a set of resources, configuration files and a topology class. Enter the directory you just created (should be the same as the artefactId you specified earlier) and follow the instructions on the README file.
-
-You will of course need to have both Storm and OpenSearch installed. For the latter, the [OpenSearch documentation](https://opensearch.org/docs/latest/install-and-configure/install-opensearch/docker/) contains resources for Docker.
-
-Unlike in the Elastic module, the schemas are automatically created by the bolts. You can of course override them by using the script 'OS_IndexInit.sh' generated by the archetype, the index definitions are located in _src/main/resources_.
-
-
-Dashboards
----------------------
-
-To import the dashboards into a local instance of OpenSearch Dashboard, go into the folder _dashboards_ and run the script _importDashboards.sh_. 
-
-You should see something like 
+Add the dependency to your crawler project:
 
+```xml
+
+    org.apache.stormcrawler
+    stormcrawler-opensearch-java
+    ${stormcrawler.version}
+
 ```
-Importing status dashboard into OpenSearch Dashboards
-{"successCount":4,"success":true,"successResults":[{"type":"index-pattern","id":"7445c390-7339-11e9-9289-ffa3ee6775e4","meta":{"title":"status","icon":"indexPatternApp"}},{"type":"visualization","id":"status-count","meta":{"title":"status count","icon":"visualizeApp"}},{"type":"visualization","id":"Top-Hosts","meta":{"title":"Top Hosts","icon":"visualizeApp"}},{"type":"dashboard","id":"Crawl-status","meta":{"title":"Crawl status","icon":"dashboardApp"}}]}
-Importing metrics dashboard into OpenSearch Dashboards
-{"successCount":9,"success":true,"successResults":[{"type":"index-pattern","id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","meta":{"title":"metrics","icon":"indexPatternApp"}},{"type":"visualization","id":"Fetcher-:-#-active-threads","meta":{"title":"Fetcher : # active threads","icon":"visualizeApp"}},{"type":"visualization","id":"Fetcher-:-num-queues","meta":{"title":"Fetcher : num queues","icon":"visualizeApp"}},{"type":"visualization","id":"Fetcher-:-pages-fetched","meta":{"title":"Fetcher : pages fetched","icon":"visualizeApp"}},{"type":"visualization","id":"Fetcher-:-URLs-waiting-in-queues","meta":{"title":"Fetcher : URLs waiting in queues","icon":"visualizeApp"}},{"type":"visualization","id":"Fetcher-:-average-bytes-per-second","meta":{"title":"Fetcher : average bytes per second","icon":"visualizeApp"}},{"type":"visualization","id":"Fetcher-:-average-pages-per-second","meta":{"title":"Fetcher : average pages per second","icon":"visualizeApp"}},{"type":"visualization","id":"Total-bytes-fetched","meta":{"title":"Total bytes fetched","icon":"visualizeApp"}},{"type":"dashboard","id":"Crawl-metrics","meta":{"title":"Crawl metrics","icon":"dashboardApp"}}]}
-
-```
-
-The [dashboard screen](http://localhost:5601/app/dashboards#/list?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-15m,to:now))) should show both the status and metrics dashboards. If you click on `Crawl Status`, you should see 2 tables containing the count of URLs per status and the top hostnames per URL count.
-The [Metrics dashboard](http://localhost:5601/app/dashboards#/view/Crawl-metrics) can be used to monitor the progress of the crawl.
-
-The file _storm.ndjson_ is used to display some of Storm's internal metrics and is not added by default.
-
-#### Per time period metric indices (optional)
-
-The _metrics_ index can be configured per time period. This best practice is [discussed on the Elastic website](https://www.elastic.co/guide/en/elasticsearch/guide/current/time-based.html).
-
-The crawler config YAML must be updated to use an optional argument as shown below to have one index per day:
-
-```
- #Metrics consumers:
-    topology.metrics.consumer.register:
-         - class: "org.apache.stormcrawler.opensearch.metrics.MetricsConsumer"
-           parallelism.hint: 1
-           argument: "yyyy-MM-dd"
-```
-
-
-
-
 
+You will of course need to have both Storm and OpenSearch installed. For the
+latter, see the [OpenSearch documentation](https://opensearch.org/docs/latest/install-and-configure/install-opensearch/docker/)
+for Docker-based setups.
 
+Schemas are automatically created by the bolts on first use; you can override
+them by providing your own index definitions before starting the topology.
 
+Configuration and dashboards
+---------------------
 
+For a ready-to-use crawler configuration, example Flux topologies, index
+initialization scripts and OpenSearch Dashboards exports, refer to the
+[`external/opensearch`](../opensearch) module: all of those resources are
+compatible with this module and have not been duplicated here.
diff --git a/external/opensearch-java/archetype/pom.xml b/external/opensearch-java/archetype/pom.xml
deleted file mode 100644
index 10b4090de..000000000
--- a/external/opensearch-java/archetype/pom.xml
+++ /dev/null
@@ -1,72 +0,0 @@
-
-
-
-
-
-    4.0.0
-
-    
-        org.apache.stormcrawler
-        stormcrawler
-        3.5.2-SNAPSHOT
-        ../../../pom.xml
-    
-
-    stormcrawler-opensearch-java-archetype
-
-    maven-archetype
-
-    
-
-        
-            
-                src/main/resources
-                true
-                
-                    META-INF/maven/archetype-metadata.xml
-                
-            
-            
-                src/main/resources
-                false
-                
-                    META-INF/maven/archetype-metadata.xml
-                
-            
-        
-
-        
-            
-                org.apache.maven.archetype
-                archetype-packaging
-                3.4.1
-            
-        
-
-        
-            
-                
-                    maven-archetype-plugin
-                    3.4.1
-                
-            
-        
-    
-
diff --git a/external/opensearch-java/archetype/src/main/resources/META-INF/archetype-post-generate.groovy b/external/opensearch-java/archetype/src/main/resources/META-INF/archetype-post-generate.groovy
deleted file mode 100644
index bbdb54974..000000000
--- a/external/opensearch-java/archetype/src/main/resources/META-INF/archetype-post-generate.groovy
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to you under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-def file1 = new File(request.getOutputDirectory(), request.getArtifactId() + "/dashboards/importDashboards.sh")
-file1.setExecutable(true, false)
-
-def file2 = new File(request.getOutputDirectory(), request.getArtifactId() + "/OS_IndexInit.sh")
-file2.setExecutable(true, false)
diff --git a/external/opensearch-java/archetype/src/main/resources/META-INF/maven/archetype-metadata.xml b/external/opensearch-java/archetype/src/main/resources/META-INF/maven/archetype-metadata.xml
deleted file mode 100644
index 4f58adcd6..000000000
--- a/external/opensearch-java/archetype/src/main/resources/META-INF/maven/archetype-metadata.xml
+++ /dev/null
@@ -1,72 +0,0 @@
-
-
-
-
-
-
-    
-        
-            ^[a-zA-Z_\-]+$
-        
-        
-        
-        
-        
-            ^\S+@\S+\.\S+$
-        
-        
-            ${project.version}
-        
-    
-
-    
-        
-            src/main/resources
-            
-                **/*.xml
-                **/*.txt
-                **/*.yaml
-                **/*.json
-                **/*.mapping
-            
-        
-        
-            
-            
-                README.md
-                *.flux
-                *.yaml
-                *.sh
-            
-        
-        
-            dashboards
-            
-                *.sh
-                *.ndjson
-            
-        
-    
-
-
diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/OS_IndexInit.sh b/external/opensearch-java/archetype/src/main/resources/archetype-resources/OS_IndexInit.sh
deleted file mode 100755
index 69698c1a8..000000000
--- a/external/opensearch-java/archetype/src/main/resources/archetype-resources/OS_IndexInit.sh
+++ /dev/null
@@ -1,40 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#!/bin/bash
-
-OSHOST=${1:-"http://localhost:9200"}
-OSCREDENTIALS=${2:-"-u opensearch:passwordhere"}
-
-curl $OSCREDENTIALS -s -XDELETE "$OSHOST/status/" >  /dev/null
-echo "Deleted 'status' index, now recreating it..."
-curl $OSCREDENTIALS -s -XPUT "$OSHOST/status" -H 'Content-Type: application/json' --upload-file src/main/resources/status.mapping
-
-echo ""
-
-curl $OSCREDENTIALS -s -XDELETE "$OSHOST/content/" >  /dev/null
-echo "Deleted 'content' index, now recreating it..."
-curl $OSCREDENTIALS -s -XPUT "$OSHOST/content" -H 'Content-Type: application/json' --upload-file src/main/resources/indexer.mapping
-
-### metrics
-
-curl $OSCREDENTIALS -s -XDELETE "$OSHOST/metrics*/" >  /dev/null
-
-echo "Deleted 'metrics' index, now recreating it..."
-
-# http://localhost:9200/metrics/_mapping/status?pretty
-curl $OSCREDENTIALS -s -XPOST "$OSHOST/_template/metrics-template" -H 'Content-Type: application/json' --upload-file src/main/resources/metrics.mapping
-
-echo ""
diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/README.md b/external/opensearch-java/archetype/src/main/resources/archetype-resources/README.md
deleted file mode 100644
index ddd7be949..000000000
--- a/external/opensearch-java/archetype/src/main/resources/archetype-resources/README.md
+++ /dev/null
@@ -1,80 +0,0 @@
-This has been generated by the StormCrawler Maven Archetype as a starting point for building your own crawler with [OpenSearch](https://opensearch.org/) as a backend.
-Have a look at the code and resources and modify them to your heart's content. 
-
-# Prerequisites
-
-## Native
-You need to have Apache Storm installed, as well as a running instance of OpenSearch.
-
-## Docker Compose
-
-We provide a simple `docker-compose.yaml` file to launch OpenSearch, Zookeeper, Storm Nimbus, Storm Supervisor, and the Storm UI.
-You may need to update `opensearch-conf.yaml` to reference the OpenSearch host configuration (Docker container name).
-
-# Compilation
-
-First generate an uberjar:
-
-``` sh
-mvn clean package
-```
-
-# URL injection
-
-The first step consists in creating a file _seeds.txt_ in the current directory and populating it with the URLs 
-to be used as a starting point for the crawl, e.g. 
-
-`echo "http://stormcrawler.net/" > seeds.txt`
-
-You can start the crawl topology in local mode using the URLs in _seeds.txt_ as a starting point with
-
-``` sh
-storm local target/${artifactId}-${version}.jar  org.apache.storm.flux.Flux injection.flux --local-ttl 3600
-```
-
-Note that in local mode, Flux uses a default TTL for the topology of 20 secs. The command above runs the topology for 1 hour.
-
-# Running the crawl
-
-To start crawling, run the following command
-
-``` sh
-storm jar target/${artifactId}-${version}.jar  org.apache.storm.flux.Flux crawler.flux
-```
-
-Note that in the previous command, we ran the topology with `storm jar` to benefit from the Storm UI and logging. In that case, the topology runs continuously, as intended.
-If you don't have a Storm cluster set up and/or want to run in local mode, simply replace _jar_ with _local_ and add _--local-ttl 3600_.
-
-
-Index definitions
----------------------
-
-Unlike in the Elastic module, the schemas are automatically created by the bolts. You can of course override them by using the script 'OS_IndexInit.sh', the index definitions are located in _src/main/resources_.
-
-
-Dashboards
----------------------
-
-To import the dashboards into a local instance of OpenSearch Dashboards, go into the folder _dashboards_ and run the script _importDashboards.sh_. 
-
-You should see something like 
-
-```
-Importing status dashboard into OpenSearch Dashboards
-{"successCount":4,"success":true,"successResults":[{"type":"index-pattern","id":"7445c390-7339-11e9-9289-ffa3ee6775e4","meta":{"title":"status","icon":"indexPatternApp"}},{"type":"visualization","id":"status-count","meta":{"title":"status count","icon":"visualizeApp"}},{"type":"visualization","id":"Top-Hosts","meta":{"title":"Top Hosts","icon":"visualizeApp"}},{"type":"dashboard","id":"Crawl-status","meta":{"title":"Crawl status","icon":"dashboardApp"}}]}
-Importing metrics dashboard into OpenSearch Dashboards
-{"successCount":9,"success":true,"successResults":[{"type":"index-pattern","id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","meta":{"title":"metrics","icon":"indexPatternApp"}},{"type":"visualization","id":"Fetcher-:-#-active-threads","meta":{"title":"Fetcher : # active threads","icon":"visualizeApp"}},{"type":"visualization","id":"Fetcher-:-num-queues","meta":{"title":"Fetcher : num queues","icon":"visualizeApp"}},{"type":"visualization","id":"Fetcher-:-pages-fetched","meta":{"title":"Fetcher : pages fetched","icon":"visualizeApp"}},{"type":"visualization","id":"Fetcher-:-URLs-waiting-in-queues","meta":{"title":"Fetcher : URLs waiting in queues","icon":"visualizeApp"}},{"type":"visualization","id":"Fetcher-:-average-bytes-per-second","meta":{"title":"Fetcher : average bytes per second","icon":"visualizeApp"}},{"type":"visualization","id":"Fetcher-:-average-pages-per-second","meta":{"title":"Fetcher : average pages per second","icon":"visualizeApp"}},{"type":"visualization","id":"Total-bytes-fetched","meta":{"title":"Total bytes fetched","icon":"visualizeApp"}},{"type":"dashboard","id":"Crawl-metrics","meta":{"title":"Crawl metrics","icon":"dashboardApp"}}]}
-
-```
-
-The [dashboard screen](http://localhost:5601/app/dashboards#/list?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-15m,to:now))) should show both the status and metrics dashboards. If you click on `Crawl Status`, you should see 2 tables containing the count of URLs per status and the top hostnames per URL count.
-The [Metrics dashboard](http://localhost:5601/app/dashboards#/view/Crawl-metrics) can be used to monitor the progress of the crawl.
-
-The file _storm.ndjson_ is used to display some of Storm's internal metrics and is not added by default.
-
-
-
-Happy crawling! If you have any questions, please ask on [StackOverflow with the tag stormcrawler](http://stackoverflow.com/questions/tagged/stormcrawler) or the [discussions](https://github.com/apache/stormcrawler/discussions) section on GitHub.
-
-
-
diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/crawler-conf.yaml b/external/opensearch-java/archetype/src/main/resources/archetype-resources/crawler-conf.yaml
deleted file mode 100644
index f62103faf..000000000
--- a/external/opensearch-java/archetype/src/main/resources/archetype-resources/crawler-conf.yaml
+++ /dev/null
@@ -1,160 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Custom configuration for StormCrawler
-# This is used to override the default values from crawler-default.xml and provide additional ones
-# for your custom components.
-# Use this file with the parameter -conf when launching your extension of ConfigurableTopology.
-# This file does not contain all the key values but only the most frequently used ones. See crawler-default.xml for an extensive list.
-
-config:
-  topology.workers: 1
-  topology.message.timeout.secs: 300
-  topology.max.spout.pending: 100
-  topology.debug: false
-
-  fetcher.threads.number: 50
-
-  # override the JVM parameters for the workers
-  topology.worker.childopts: "-Xmx2g -Djava.net.preferIPv4Stack=true"
-
-  # mandatory when using Flux
-  topology.kryo.register:
-    - org.apache.stormcrawler.Metadata
-    - org.apache.stormcrawler.persistence.Status
-
-  # Lists the metadata to transfer to outlinks
-  # Used by Fetcher and SiteMapParser for redirections,
-  # discovered links, passing cookies to child pages, etc.
-  # These are also persisted for the parent document (see below).
-  # Allows wildcards, eg. "follow.*" transfers all metadata starting with "follow.".
-  # metadata.transfer:
-  # - customMetadataName
-
-  # Lists the metadata to persist to storage
-  # These are not transferred to the outlinks. Also allows wildcards, eg. "follow.*".
-  metadata.persist:
-   - _redirTo
-   - error.cause
-   - error.source
-   - isSitemap
-   - isFeed
-
-  # Agent name info - given here as an example. Do not be an anonynmous coward, use your real information!
-  # The full user agent value sent as part of the HTTP requests
-  # is built from the elements below. Only the agent.name is mandatory,
-  # it is also used to parse the robots.txt directives.
-
-  # The agent name must be compliant with RFC 9309 (section 2.2.1)
-  # i.e. it MUST contain only uppercase and lowercase letters ("a-z" and "A-Z), underscores ("_"), and hyphens ("-")
-  http.agent.name: "${http-agent-name}"
-  # version of your crawler
-  http.agent.version: "${http-agent-version}"
-  # description of what it does
-  http.agent.description: "${http-agent-description}"
-  # URL webmasters can go to to learn about it
-  http.agent.url: "${http-agent-url}"
-  # Finally, an email so that they can get in touch with you
-  http.agent.email: "${http-agent-email}"
-
-  http.protocol.implementation: "org.apache.stormcrawler.protocol.okhttp.HttpProtocol"
-  https.protocol.implementation: "org.apache.stormcrawler.protocol.okhttp.HttpProtocol"
-
-  # The maximum number of bytes for returned HTTP response bodies.
-  # The fetched page will be trimmed to 65KB in this case
-  # Set -1 to disable the limit.
-  http.content.limit: 65536
-
-  sitemap.discovery: true
-
-  # FetcherBolt queue dump => comment out to activate
-  # if a file exists on the worker machine with the corresponding port number
-  # the FetcherBolt will log the content of its internal queues to the logs
-  # fetcherbolt.queue.debug.filepath: "/tmp/fetcher-dump-{port}"
-
-  parsefilters.config.file: "parsefilters.json"
-  urlfilters.config.file: "urlfilters.json"
-  jsoup.filters.config.file: "jsoupfilters.json"
-
-  # revisit a page daily (value in minutes)
-  # set it to -1 to never refetch a page
-  fetchInterval.default: 1440
-
-  # revisit a page with a fetch error after 2 hours (value in minutes)
-  # set it to -1 to never refetch a page
-  fetchInterval.fetch.error: 120
-
-  # never revisit a page with an error (or set a value in minutes)
-  fetchInterval.error: -1
-
-  # set to true if you don't need any text to be extracted by JSoup
-  textextractor.no.text: false
-
-  # text extraction for JSoupParserBolt
-  textextractor.include.pattern:
-   - DIV[id="maincontent"]
-   - DIV[itemprop="articleBody"]
-   - ARTICLE
-
-  textextractor.exclude.tags:
-   - STYLE
-   - SCRIPT
-
-  # needed for parsing with Tika
-  jsoup.treat.non.html.as.error: false
-
-  # restricts the documents types to be parsed with Tika
-  parser.mimetype.whitelist:
-   - application/.+word.*
-   - application/.+excel.*
-   - application/.+powerpoint.*
-   - application/.*pdf.*
-
-  # Tika parser configuration file
-  parse.tika.config.file: "tika-config.xml"
-
-  # custom fetch interval to be used when a document has the key/value in its metadata
-  # and has been fetched successfully (value in minutes)
-  # fetchInterval.FETCH_ERROR.isFeed=true: 30
-  # fetchInterval.isFeed=true: 10
-
-  # configuration for the classes extending AbstractIndexerBolt
-  # indexer.md.filter: "someKey=aValue"
-  indexer.url.fieldname: "url"
-  indexer.text.fieldname: "content"
-  indexer.canonical.name: "canonical"
-  # How to convert metadata key values into fields for indexing
-  #
-  # if no alias is specified with =alias, the key value is used
-  # for instance below, _domain_ and _format_ will be used
-  # as field names, whereas _title_ will be used for _parse.title_.
-  # You can specify the index of the value to store from the values array
-  # by using the _key[index]_ format, e.g. _parse.title[0]_ would try to
-  # get the first value for the metadata _parse.title_ (which is the default anyway).
-  # Finally, you can use a glob (*) to match all the keys, e.g. _parse.*_ would
-  # index all the keys with _parse_ as a prefix. Note that in that case, you can't
-  # specify an alias with =, nor can you specify an index.
-  indexer.md.mapping:
-  - parse.title=title
-  - parse.keywords=keywords
-  - parse.description=description
-  - domain
-  - format
-
-  # Metrics consumers:
-  topology.metrics.consumer.register:
-     - class: "org.apache.storm.metric.LoggingMetricsConsumer"
-       parallelism.hint: 1
-
diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/crawler.flux b/external/opensearch-java/archetype/src/main/resources/archetype-resources/crawler.flux
deleted file mode 100644
index 85fb6c655..000000000
--- a/external/opensearch-java/archetype/src/main/resources/archetype-resources/crawler.flux
+++ /dev/null
@@ -1,141 +0,0 @@
-name: "crawler"
-
-includes:
-    - resource: true
-      file: "/crawler-default.yaml"
-      override: false
-
-    - resource: false
-      file: "crawler-conf.yaml"
-      override: true
-
-    - resource: false
-      file: "opensearch-conf.yaml"
-      override: true
-
-spouts:
-  - id: "spout"
-    className: "org.apache.stormcrawler.opensearch.persistence.AggregationSpout"
-    parallelism: 10
-
-bolts:
-  - id: "partitioner"
-    className: "org.apache.stormcrawler.bolt.URLPartitionerBolt"
-    parallelism: 1
-  - id: "fetcher"
-    className: "org.apache.stormcrawler.bolt.FetcherBolt"
-    parallelism: 1
-  - id: "sitemap"
-    className: "org.apache.stormcrawler.bolt.SiteMapParserBolt"
-    parallelism: 1
-  - id: "parse"
-    className: "org.apache.stormcrawler.bolt.JSoupParserBolt"
-    parallelism: 1
-  - id: "shunt"
-    className: "org.apache.stormcrawler.tika.RedirectionBolt"
-    parallelism: 1
-  - id: "tika"
-    className: "org.apache.stormcrawler.tika.ParserBolt"
-    parallelism: 1
-  - id: "index"
-    className: "org.apache.stormcrawler.opensearch.bolt.IndexerBolt"
-    parallelism: 1
-  - id: "status"
-    className: "org.apache.stormcrawler.opensearch.persistence.StatusUpdaterBolt"
-    parallelism: 1
-  - id: "deleter"
-    className: "org.apache.stormcrawler.opensearch.bolt.DeletionBolt"
-    parallelism: 1
-  - id: "status_metrics"
-    className: "org.apache.stormcrawler.opensearch.metrics.StatusMetricsBolt"
-    parallelism: 1
-
-streams:
-  - from: "spout"
-    to: "partitioner"
-    grouping:
-      type: SHUFFLE
-
-  - from: "__system"
-    to: "status_metrics"
-    grouping:
-      type: SHUFFLE
-      streamId: "__tick"
-
-  - from: "partitioner"
-    to: "fetcher"
-    grouping:
-      type: FIELDS
-      args: ["key"]
-
-  - from: "fetcher"
-    to: "sitemap"
-    grouping:
-      type: LOCAL_OR_SHUFFLE
-
-  - from: "sitemap"
-    to: "parse"
-    grouping:
-      type: LOCAL_OR_SHUFFLE
-
-  - from: "parse"
-    to: "shunt"
-    grouping:
-      type: LOCAL_OR_SHUFFLE
-
-  - from: "shunt"
-    to: "tika"
-    grouping:
-      type: LOCAL_OR_SHUFFLE
-      streamId: "tika"
-
-  - from: "tika"
-    to: "index"
-    grouping:
-      type: LOCAL_OR_SHUFFLE
-
-  - from: "shunt"
-    to: "index"
-    grouping:
-      type: LOCAL_OR_SHUFFLE
-
-  - from: "fetcher"
-    to: "status"
-    grouping:
-      type: FIELDS
-      args: ["url"]
-      streamId: "status"
-
-  - from: "sitemap"
-    to: "status"
-    grouping:
-      type: FIELDS
-      args: ["url"]
-      streamId: "status"
-
-  - from: "parse"
-    to: "status"
-    grouping:
-      type: FIELDS
-      args: ["url"]
-      streamId: "status"
-
-  - from: "tika"
-    to: "status"
-    grouping:
-      type: FIELDS
-      args: ["url"]
-      streamId: "status"
-
-  - from: "index"
-    to: "status"
-    grouping:
-      type: FIELDS
-      args: ["url"]
-      streamId: "status"
-
-  - from: "status"
-    to: "deleter"
-    grouping:
-      type: LOCAL_OR_SHUFFLE
-      streamId: "deletion"
diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/importDashboards.sh b/external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/importDashboards.sh
deleted file mode 100755
index 561f739c1..000000000
--- a/external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/importDashboards.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#!/bin/sh
-
-BIN=$(dirname $0)
-
-echo "Importing status dashboard into OpenSearch Dashboards"
-curl -X POST "localhost:5601/api/saved_objects/_import" -H "osd-xsrf: true" --form file=@$BIN/status.ndjson
-echo ""
-
-echo "Importing metrics dashboard into OpenSearch Dashboards"
-curl -X POST "localhost:5601/api/saved_objects/_import" -H "osd-xsrf: true" --form file=@$BIN/metrics.ndjson
-echo ""
-
-# Storm internal metrics
-# curl -X POST "localhost:5601/api/saved_objects/_import" -H "kbn-xsrf: true" --form file=@$BIN/storm.ndjson
diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/metrics.ndjson b/external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/metrics.ndjson
deleted file mode 100644
index 20cbb2bc0..000000000
--- a/external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/metrics.ndjson
+++ /dev/null
@@ -1,10 +0,0 @@
-{"attributes":{"fields":"[{\"name\":\"_id\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_index\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_score\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_source\",\"type\":\"_source\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_type\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"name\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcComponentId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcTaskId\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerHost\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerPort\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"stormId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"timestamp\",\"type\":\"date\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"value\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true}]","timeFieldName":"timestamp","title":"metrics"},"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","mi
grationVersion":{"index-pattern":"6.5.0"},"references":[],"type":"index-pattern","updated_at":"2020-01-06T11:06:58.178Z","version":"WzMsMV0="}
-{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:activethreads\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : # active threads","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"3\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"2\"}}],\"listeners\":{},\"title\":\"Fetcher : # active threads\"}"},"id":"Fetcher-:-#-active-threads","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.178Z","version":"WzksMV0="}
-{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:num_queues\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : num queues","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{},\"spyPerPage\":10},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"3\"}}],\"listeners\":{},\"title\":\"Fetcher : num queues\"}"},"id":"Fetcher-:-num-queues","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.175Z","version":"WzgsMV0="}
-{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_counter.fetched\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : pages fetched","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{},\"spyPerPage\":10},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"sum\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"3\"}}],\"listeners\":{},\"title\":\"Fetcher : pages fetched\"}"},"id":"Fetcher-:-pages-fetched","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.170Z","version":"WzcsMV0="}
-{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:in_queues\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : URLs waiting in queues","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"addLegend\":false,\"addTimeMarker\":false,\"addTooltip\":true,\"defaultYExtents\":false,\"mode\":\"grouped\",\"scale\":\"linear\",\"setYExtents\":false,\"shareYAxis\":true,\"spyPerPage\":10,\"times\":[],\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"4\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"5\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"4\"}}],\"listeners\":{},\"title\":\"Fetcher : URLs waiting in queues\"}"},"id":"Fetcher-:-URLs-waiting-in-queues","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.160Z","version":"WzUsMV0="}
-{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_average_persec.bytes_fetched_perSec\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : average bytes per second","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"line\",\"params\":{\"shareYAxis\":true,\"addTooltip\":false,\"addLegend\":false,\"showCircles\":true,\"smoothLines\":true,\"interpolate\":\"linear\",\"scale\":\"linear\",\"drawLinesBetweenPoints\":true,\"radiusRatio\":9,\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}}],\"listeners\":{},\"title\":\"Fetcher : average bytes per second\"}"},"id":"Fetcher-:-average-bytes-per-second","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.173Z","version":"WzYsMV0="}
-{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_average_persec.fetched_perSec\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : average pages per second","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"line\",\"params\":{\"shareYAxis\":true,\"addTooltip\":false,\"addLegend\":false,\"showCircles\":true,\"smoothLines\":true,\"interpolate\":\"linear\",\"scale\":\"linear\",\"drawLinesBetweenPoints\":true,\"radiusRatio\":9,\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"3\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}}],\"listeners\":{},\"title\":\"Fetcher : average pages per second\"}"},"id":"Fetcher-:-average-pages-per-second","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.820Z","version":"WzEwLDFd"}
-{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_counter.bytes_fetched\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Total bytes fetched","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"line\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"showCircles\":true,\"smoothLines\":true,\"interpolate\":\"linear\",\"scale\":\"linear\",\"drawLinesBetweenPoints\":true,\"radiusRatio\":9,\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"sum\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"m\",\"min_doc_count\":1,\"extended_bounds\":{}}}],\"listeners\":{},\"title\":\"Total bytes fetched\"}"},"id":"Total-bytes-fetched","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.823Z","version":"WzExLDFd"}
-{"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"filter\":[],\"query\":{\"query\":{\"query_string\":{\"analyze_wildcard\":true,\"query\":\"*\"}},\"language\":\"lucene\"}}"},"optionsJSON":"{\"useMargins\":true,\"hidePanelTitles\":false}","panelsJSON":"[{\"panelIndex\":\"1\",\"gridData\":{\"x\":24,\"y\":20,\"w\":12,\"h\":12,\"i\":\"1\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_0\"},{\"panelIndex\":\"2\",\"gridData\":{\"x\":12,\"y\":20,\"w\":12,\"h\":12,\"i\":\"2\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_1\"},{\"panelIndex\":\"3\",\"gridData\":{\"x\":0,\"y\":0,\"w\":36,\"h\":12,\"i\":\"3\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_2\"},{\"panelIndex\":\"4\",\"gridData\":{\"x\":0,\"y\":20,\"w\":12,\"h\":12,\"i\":\"4\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_3\"},{\"panelIndex\":\"5\",\"gridData\":{\"x\":0,\"y\":40,\"w\":36,\"h\":8,\"i\":\"5\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_4\"},{\"panelIndex\":\"6\",\"gridData\":{\"x\":0,\"y\":32,\"w\":36,\"h\":8,\"i\":\"6\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_5\"},{\"panelIndex\":\"7\",\"gridData\":{\"x\":0,\"y\":12,\"w\":36,\"h\":8,\"i\":\"7\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_6\"}]","timeRestore":false,"title":"Crawl metrics","version":1},"id":"Crawl-metrics","migrationVersion":{"dashboard":"7.3.0"},"references":[{"id":"Fetcher-:-#-active-threads","name":"panel_0","type":"visualization"},{"id":"Fetcher-:-num-queues","name":"panel_1","type":"visualization"},{"id":"Fetcher-:-pages-fetched","name":"panel_2","type":"visualization"},{"id":"Fetcher-:-URLs-waiting-in-queues","name":"panel_3","type":"visualization"},{"id":"Fetcher-:-average-bytes-per-second","name":"panel_4","type":"visualization"},{"id":"Fetcher-:-average-pages-per-second","name":"panel_5","type":"visualization"},{"id":"Total-bytes-fetched","name":"panel_6","type":"visualization"}],"type":"dashboard","updated_at":"2020-01-06T11:06:58.830Z","version":"WzQsMV0="}
-{"exportedCount":9,"missingRefCount":0,"missingReferences":[]}
diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/status.ndjson b/external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/status.ndjson
deleted file mode 100644
index b3d0122e4..000000000
--- a/external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/status.ndjson
+++ /dev/null
@@ -1,5 +0,0 @@
-{"attributes":{"fields":"[{\"name\":\"_id\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_index\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_score\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_source\",\"type\":\"_source\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_type\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"key\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"metadata._redirTo\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.depth\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.error%2Ecause\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.error%2Esource\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.fetch%2Eerror%2Ecount\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.isFeed\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.isSitemap\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"na
me\":\"metadata.url%2Epath\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"nextFetchDate\",\"type\":\"date\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"status\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"url\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":true,\"readFromDocValues\":true}]","title":"status"},"id":"7445c390-7339-11e9-9289-ffa3ee6775e4","migrationVersion":{"index-pattern":"6.5.0"},"references":[],"type":"index-pattern","updated_at":"2020-01-06T11:07:47.130Z","version":"WzEzLDFd"}
-{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"*\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"status count","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"table\",\"params\":{\"perPage\":10,\"showPartialRows\":false,\"showMeticsAtAllLevels\":false},\"aggs\":[{\"id\":\"1\",\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"status\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{},\"title\":\"status count\"}"},"id":"status-count","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"7445c390-7339-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:07:48.278Z","version":"WzE1LDFd"}
-{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"*\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Top Hosts","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"table\",\"params\":{\"perPage\":10,\"showPartialRows\":false,\"showMeticsAtAllLevels\":false},\"aggs\":[{\"id\":\"1\",\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"key\",\"size\":50,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{},\"title\":\"Top Hosts\"}"},"id":"Top-Hosts","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"7445c390-7339-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:07:48.281Z","version":"WzE2LDFd"}
-{"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"filter\":[],\"query\":{\"query\":\"*\",\"language\":\"lucene\"}}"},"optionsJSON":"{\"useMargins\":true,\"hidePanelTitles\":false}","panelsJSON":"[{\"panelRefName\":\"panel_0\",\"version\":\"7.3.0\",\"panelIndex\":\"37874bbf-6607-435a-a231-94d81e9193e7\",\"gridData\":{\"x\":0,\"y\":0,\"w\":16,\"h\":20,\"i\":\"37874bbf-6607-435a-a231-94d81e9193e7\"},\"embeddableConfig\":{}},{\"panelRefName\":\"panel_1\",\"version\":\"7.3.0\",\"panelIndex\":\"4faa5b74-1660-44f7-9227-89d900c8231e\",\"gridData\":{\"x\":16,\"y\":0,\"w\":16,\"h\":20,\"i\":\"4faa5b74-1660-44f7-9227-89d900c8231e\"},\"embeddableConfig\":{}}]","timeRestore":false,"title":"Crawl status","version":1},"id":"Crawl-status","migrationVersion":{"dashboard":"7.3.0"},"references":[{"id":"status-count","name":"panel_0","type":"visualization"},{"id":"Top-Hosts","name":"panel_1","type":"visualization"}],"type":"dashboard","updated_at":"2020-01-06T11:07:47.948Z","version":"WzE0LDFd"}
-{"exportedCount":4,"missingRefCount":0,"missingReferences":[]}
diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/storm.ndjson b/external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/storm.ndjson
deleted file mode 100644
index 1d25d1f6e..000000000
--- a/external/opensearch-java/archetype/src/main/resources/archetype-resources/dashboards/storm.ndjson
+++ /dev/null
@@ -1,5 +0,0 @@
-{"attributes":{"fields":"[{\"name\":\"_id\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_index\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_score\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_source\",\"type\":\"_source\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_type\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"name\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcComponentId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcTaskId\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerHost\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerPort\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"stormId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"timestamp\",\"type\":\"date\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"value\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true}]","timeFieldName":"timestamp","title":"metrics"},"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","mi
grationVersion":{"index-pattern":"6.5.0"},"references":[],"type":"index-pattern","updated_at":"2020-01-06T11:06:58.178Z","version":"WzMsMV0="}
-{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name: \\\"__receive.population\\\"\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Storm Receive Queues","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"table\",\"params\":{\"perPage\":10,\"showPartialRows\":false,\"showMeticsAtAllLevels\":false},\"aggs\":[{\"id\":\"1\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"srcTaskId\",\"size\":10,\"order\":\"desc\",\"orderBy\":\"1\"}},{\"id\":\"3\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"srcComponentId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{}}"},"id":"Storm-Receive-Queues","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:09:12.875Z","version":"WzIwLDFd"}
-{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"+srcComponentId: \\\"__system\\\" +name: memory\\\\/heap*\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Memory Heap","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":true,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":true,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"name\",\"size\":10,\"order\":\"desc\",\"orderBy\":\"1\"}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"split\",\"params\":{\"field\":\"srcWorkerHost\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\",\"row\":true}}],\"listeners\":{}}"},"id":"Memory-Heap","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:09:12.877Z","version":"WzIxLDFd"}
-{"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"filter\":[],\"query\":{\"query\":\"*\",\"language\":\"lucene\"}}"},"optionsJSON":"{\"useMargins\":true,\"hidePanelTitles\":false}","panelsJSON":"[{\"panelRefName\":\"panel_0\",\"version\":\"7.3.0\",\"panelIndex\":\"19123ee9-8f49-4621-a4dc-716b5ff9fcaf\",\"gridData\":{\"x\":0,\"y\":0,\"w\":32,\"h\":8,\"i\":\"19123ee9-8f49-4621-a4dc-716b5ff9fcaf\"},\"embeddableConfig\":{}},{\"panelRefName\":\"panel_1\",\"version\":\"7.3.0\",\"panelIndex\":\"5fd83542-b7e6-48e0-8679-2ffcacf453a3\",\"gridData\":{\"x\":0,\"y\":8,\"w\":32,\"h\":16,\"i\":\"5fd83542-b7e6-48e0-8679-2ffcacf453a3\"},\"embeddableConfig\":{}}]","timeRestore":false,"title":"Storm metrics","version":1},"id":"Storm-metrics","migrationVersion":{"dashboard":"7.3.0"},"references":[{"id":"Storm-Receive-Queues","name":"panel_0","type":"visualization"},{"id":"Memory-Heap","name":"panel_1","type":"visualization"}],"type":"dashboard","updated_at":"2020-01-06T11:08:33.810Z","version":"WzE5LDFd"}
-{"exportedCount":4,"missingRefCount":0,"missingReferences":[]}
diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/docker-compose.yml b/external/opensearch-java/archetype/src/main/resources/archetype-resources/docker-compose.yml
deleted file mode 100644
index ccad3cc41..000000000
--- a/external/opensearch-java/archetype/src/main/resources/archetype-resources/docker-compose.yml
+++ /dev/null
@@ -1,81 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-services:
-  zookeeper:
-    image: zookeeper:3.9.3
-    container_name: zookeeper
-    restart: always
-
-  nimbus:
-    image: storm:latest
-    container_name: nimbus
-    hostname: nimbus
-    command: storm nimbus
-    depends_on:
-      - zookeeper
-    restart: always
-
-  supervisor:
-    image: storm:latest
-    container_name: supervisor
-    command: storm supervisor -c worker.childopts=-Xmx%HEAP-MEM%m
-    depends_on:
-      - nimbus
-      - zookeeper
-    restart: always
-
-  ui:
-    image: storm:latest
-    container_name: ui
-    command: storm ui
-    depends_on:
-      - nimbus
-    restart: always
-    ports:
-      - "127.0.0.1:8080:8080"
-
-  opensearch-sc:
-    image: opensearchproject/opensearch:2.19.4
-    container_name: opensearch-sc
-    environment:
-      - cluster.name=opensearch-sc-cluster
-      - node.name=opensearch-sc
-      - discovery.type=single-node
-      - bootstrap.memory_lock=true # along with the memlock settings below, disables swapping
-      - "OPENSEARCH_JAVA_OPTS=-Xms4G -Xmx4G"
-      - plugins.security.disabled=true
-      - "DISABLE_INSTALL_DEMO_CONFIG=true"
-    volumes:
-      - opensearch-sc-data:/usr/share/opensearch/data
-    ulimits:
-      memlock:
-        soft: -1
-        hard: -1
-      nofile:
-        soft: 65536 # maximum number of open files for the OpenSearch user, set to at least 65536 on modern systems
-        hard: 65536
-    ports:
-      - "127.0.0.1:9200:9200" # REST API
-
-  opensearch-dashboard:
-    image: opensearchproject/opensearch-dashboards:2.19.4
-    container_name: dashboard
-    ports:
-      - "127.0.0.1:5601:5601"
-    expose:
-      - "5601"
-    environment:
-      - 'OPENSEARCH_HOSTS=["http://opensearch-sc:9200"]'
-      - "DISABLE_SECURITY_DASHBOARDS_PLUGIN=true" # disables security dashboards plugin in OpenSearch Dashboards
diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/injection.flux b/external/opensearch-java/archetype/src/main/resources/archetype-resources/injection.flux
deleted file mode 100644
index 060c1052f..000000000
--- a/external/opensearch-java/archetype/src/main/resources/archetype-resources/injection.flux
+++ /dev/null
@@ -1,50 +0,0 @@
-name: "injection"
-
-includes:
-    - resource: true
-      file: "/crawler-default.yaml"
-      override: false
-
-    - resource: false
-      file: "crawler-conf.yaml"
-      override: true
-
-    - resource: false
-      file: "opensearch-conf.yaml"
-      override: true
-
-spouts:
-  - id: "filespout"
-    className: "org.apache.stormcrawler.spout.FileSpout"
-    parallelism: 1
-    constructorArgs:
-      - "."
-      - "seeds.txt"
-      - true
-
-bolts:
-  - id: "filter"
-    className: "org.apache.stormcrawler.bolt.URLFilterBolt"
-    parallelism: 1
-
-  - id: "status"
-    className: "org.apache.stormcrawler.opensearch.persistence.StatusUpdaterBolt"
-    parallelism: 1
-
-streams:
-  - from: "filespout"
-    to: "filter"
-    grouping:
-      type: FIELDS
-      args: ["url"]
-      streamId: "status"
-
-  - from: "filter"
-    to: "status"
-    grouping:
-      streamId: "status"
-      type: CUSTOM
-      customClass:
-        className: "org.apache.stormcrawler.util.URLStreamGrouping"
-        constructorArgs:
-          - "byDomain"
diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/opensearch-conf.yaml b/external/opensearch-java/archetype/src/main/resources/archetype-resources/opensearch-conf.yaml
deleted file mode 100644
index 25d6e4dba..000000000
--- a/external/opensearch-java/archetype/src/main/resources/archetype-resources/opensearch-conf.yaml
+++ /dev/null
@@ -1,115 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# configuration for OpenSearch resources
-
-config:
-
-  # address to use unless a more specific one has been
-  # defined for a component
-  # also accepts a list or multiple values in a single line
-  # separated by a semi-colon e.g. "opensearch1:9200; opensearch2:9200"
-  opensearch.addresses: "http://localhost:9200"
-  #opensearch.user: "USERNAME"
-  #opensearch.password: "PASSWORD"
-  opensearch.concurrentRequests: 2
-
-  # Disable TLS validation for connection to OpenSearch
-  # opensearch.disable.tls.validation: false
-
-  # Indexer bolt
-  # addresses can be specified as a full URL
-  # if not we assume that the protocol is http and the port 9200
-  opensearch.indexer.addresses: "localhost"
-  opensearch.indexer.index.name: "content"
-  # opensearch.indexer.pipeline: "_PIPELINE_"
-  opensearch.indexer.create: false
-  opensearch.indexer.bulkActions: 100
-  opensearch.indexer.flushInterval: "2s"
-  opensearch.indexer.concurrentRequests: 1
-  opensearch.indexer.sniff: true
-
-  # MetricsConsumer
-  # opensearch.metrics.addresses: "http://localhost:9200"
-  opensearch.metrics.index.name: "metrics"
-  opensearch.metrics.sniff: true
-
-  # Spout and persistence bolt
-  opensearch.status.addresses: "http://localhost:9200"
-  opensearch.status.index.name: "status"
-  #opensearch.status.user: "USERNAME"
-  #opensearch.status.password: "PASSWORD"
-  # the routing is done on the value of 'partition.url.mode'
-  opensearch.status.routing: true
-  # stores the value used for grouping the URLs as a separate field
-  # needed by the spout implementations
-  # also used for routing if the value above is set to true
-  opensearch.status.routing.fieldname: "key"
-  opensearch.status.bulkActions: 500
-  opensearch.status.flushInterval: "5s"
-  opensearch.status.concurrentRequests: 1
-  opensearch.status.sniff: true
-
-    # spout config #
-
-  # positive or negative filters parsable by the Lucene Query Parser
-  # opensearch.status.filterQuery:
-  #  - "-(key:stormcrawler.net)"
-  #  - "-(key:stormcrawler.apache.org)"
-
-  # time in secs for which the URLs will be considered for fetching after a ack of fail
-  spout.ttl.purgatory: 30
-
-  # Min time (in msecs) to allow between 2 successive queries to OpenSearch
-  spout.min.delay.queries: 2000
-
-  # Max time (in msecs) to allow between 2 successive queries to OpenSearch
-  spout.max.delay.queries: 20000
-
-  # Delay since previous query date (in secs) after which the nextFetchDate value will be reset to the current time
-  # Setting this to -1 or a large value means that OpenSearch will cache the results but also that fewer and fewer
-  # results might be returned.
-  spout.reset.fetchdate.after: 120
-
-  opensearch.status.max.buckets: 50
-  opensearch.status.max.urls.per.bucket: 2
-  # field to group the URLs into buckets
-  opensearch.status.bucket.field: "key"
-  # fields to sort the URLs within a bucket
-  opensearch.status.bucket.sort.field:
-   - "nextFetchDate"
-   - "url"
-  # field to sort the buckets
-  opensearch.status.global.sort.field: "nextFetchDate"
-
-  # AggregationSpout : sampling improves the performance on large crawls
-  opensearch.status.sample: false
-
-  # max allowed duration of a query in sec
-  opensearch.status.query.timeout: -1
-
-  # AggregationSpout (expert): adds this value in mins to the latest date returned in the results and
-  # use it as nextFetchDate
-  opensearch.status.recentDate.increase: -1
-  opensearch.status.recentDate.min.gap: -1
-
-  topology.metrics.consumer.register:
-       - class: "org.apache.stormcrawler.opensearch.metrics.MetricsConsumer"
-         parallelism.hint: 1
-         #whitelist:
-         #  - "fetcher_counter"
-         #  - "fetcher_average.bytes_fetched"
-         #blacklist:
-         #  - "__receive.*"
diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/pom.xml b/external/opensearch-java/archetype/src/main/resources/archetype-resources/pom.xml
deleted file mode 100644
index cdfb7204f..000000000
--- a/external/opensearch-java/archetype/src/main/resources/archetype-resources/pom.xml
+++ /dev/null
@@ -1,149 +0,0 @@
-
-
-
-
-
-
-    4.0.0
-    ${groupId}
-    ${artifactId}
-    ${version}
-    jar
-
-    ${artifactId}
-
-    
-        UTF-8
-        ${StormCrawlerVersion}
-        2.8.5
-    
-
-    
-        
-            
-                org.apache.maven.plugins
-                maven-compiler-plugin
-                3.11.0
-                
-                    17
-                    17
-                
-            
-            
-                org.codehaus.mojo
-                exec-maven-plugin
-                3.1.0
-                
-                    
-                        
-                            exec
-                        
-                    
-                
-                
-                    java
-                    true
-                    false
-                    compile
-                
-            
-            
-                org.apache.maven.plugins
-                maven-shade-plugin
-                3.5.0
-                
-                    
-                        package
-                        
-                            shade
-                        
-                        
-                            false
-                            
-                                
-                                
-                                    org.apache.storm.flux.Flux
-                                    
-                                        
-                                        
-                                    
-                                
-                            
-                            
-                            
-                                
-                                    *:*
-                                    
-                                        META-INF/*.SF
-                                        META-INF/*.DSA
-                                        META-INF/*.RSA
-                                    
-                                
-                                
-                                    
-                                    org.apache.storm:flux-core
-                                    
-                                        org/apache/commons/**
-                                        org/apache/http/**
-                                        org/yaml/**
-                                    
-                                
-                            
-                        
-                    
-                
-            
-        
-    
-
-    
-        
-            org.apache.stormcrawler
-            stormcrawler-core
-            ${stormcrawler.version}
-        
-        
-            org.apache.stormcrawler
-            stormcrawler-opensearch-java
-            ${stormcrawler.version}
-        
-        
-            org.apache.storm
-            storm-client
-            ${storm.version}
-            provided
-        
-        
-            org.apache.storm
-            flux-core
-            ${storm.version}
-        
-        
-            org.apache.stormcrawler
-            stormcrawler-tika
-            ${stormcrawler.version}
-        
-    
-
diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-filters.txt b/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-filters.txt
deleted file mode 100644
index 389ef587b..000000000
--- a/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-filters.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-# skip file: ftp: and mailto: urls
--^(file|ftp|mailto):
-
-# skip image and other suffixes we can't parse or are not likely to be relevant
-# if you want to crawl images or videos or archives then you should comment out this line
--(?i)\.(apk|deb|cab|iso|gif|jpg|png|svg|ico|css|sit|eps|wmf|rar|tar|jar|zip|gz|bz2|rpm|tgz|mov|exe|jpeg|jpe|bmp|js|mpg|mp3|mp4|m4a|ogv|kml|wmv|swf|flv|mkv|m4v|webm|ra|wma|wav|avi|xspf|m3u)(\?|&|$)
-
-# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
-# very time-consuming : use BasicURLFilter instead
-# -.*(/[^/]+)/[^/]+\1/[^/]+\1/
-
-# exclude localhost and equivalents to avoid that information
-# can be leaked by placing faked links pointing to web interfaces
-# of services running on the crawling machine (e.g., Elasticsearch,
-# Storm)
-#
-# - exclude localhost and loop-back addresses
-#     http://localhost:8080
-#     http://127.0.0.1/ .. http://127.255.255.255/
-#     http://[::1]/
--^https?://(?:localhost|127(?:\.(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))){3}|\[::1\])(?::\d+)?(?:/|$)
-#
-# - exclude private IP address spaces
-#     10.0.0.0/8
--^https?://(?:10(?:\.(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))){3})(?::\d+)?(?:/|$)
-#     192.168.0.0/16
--^https?://(?:192\.168(?:\.(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))){2})(?::\d+)?(?:/|$)
-#     172.16.0.0/12
--^https?://(?:172\.(?:1[6789]|2[0-9]|3[01])(?:\.(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))){2})(?::\d+)?(?:/|$)
-
-# accept anything else
-+.
diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-normalizers.xml b/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-normalizers.xml
deleted file mode 100644
index accea7b5c..000000000
--- a/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-normalizers.xml
+++ /dev/null
@@ -1,78 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-  
-  
-
-
-
-
-
-
-  
-  
-
-
-
-
-  
-  
-
-
-
-
-  
-  
-
-
-
-
-  
-  
-
-
-
-
-    
-    
-
-
-
-
-  
-  
-
-
-
diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/indexer.mapping b/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/indexer.mapping
deleted file mode 100644
index fc6eb887f..000000000
--- a/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/indexer.mapping
+++ /dev/null
@@ -1,40 +0,0 @@
-{
-	"settings": {
-		"index": {
-			"number_of_shards": 5,
-			"number_of_replicas": 1,
-			"refresh_interval": "60s"
-		}
-	},
-	"mappings": {
-			"_source": {
-				"enabled": true
-			},
-			"properties": {
-				"content": {
-					"type": "text"
-				},
-				"description": {
-					"type": "text"
-				},
-				"domain": {
-					"type": "keyword"
-				},
-				"format": {
-					"type": "keyword"
-				},
-				"keywords": {
-					"type": "keyword"
-				},
-				"host": {
-					"type": "keyword"
-				},
-				"title": {
-					"type": "text"
-				},
-				"url": {
-					"type": "keyword"
-				}
-			}
-	}
-}
diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/jsoupfilters.json b/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/jsoupfilters.json
deleted file mode 100644
index 4d87d8d5a..000000000
--- a/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/jsoupfilters.json
+++ /dev/null
@@ -1,27 +0,0 @@
-{
-  "org.apache.stormcrawler.parse.JSoupFilters": [
-    {
-      "class": "org.apache.stormcrawler.jsoup.XPathFilter",
-      "name": "XPathFilter",
-      "params": {
-        "canonical": "//*[@rel=\"canonical\"]/@href",
-        "parse.description": [
-          "//*[@name=\"description\"]/@content",
-          "//*[@name=\"Description\"]/@content"
-        ],
-        "parse.title": [
-          "//TITLE/allText()",
-          "//META[@name=\"title\"]/@content"
-        ],
-        "parse.keywords": "//META[@name=\"keywords\"]/@content"
-      }
-    },
-    {
-      "class": "org.apache.stormcrawler.jsoup.LinkParseFilter",
-      "name": "LinkParseFilter",
-      "params": {
-        "pattern": "//FRAME/@src"
-      }
-    }
-  ]
-}
diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/metrics.mapping b/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/metrics.mapping
deleted file mode 100644
index fc6ae3a09..000000000
--- a/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/metrics.mapping
+++ /dev/null
@@ -1,40 +0,0 @@
-{
-	  "index_patterns": "metrics*",
-	  "settings": {
-	    "index": {
-	      "number_of_shards": 1,
-	      "refresh_interval": "30s"
-	    },
-	    "number_of_replicas": 0
-	  },
-	  "mappings": {
-	      "_source":         { "enabled": true },
-	      "properties": {
-	          "name": {
-	            "type": "keyword"
-	          },
-	          "stormId": {
-	            "type": "keyword"
-	          },
-	          "srcComponentId": {
-	            "type": "keyword"
-	          },
-	          "srcTaskId": {
-	            "type": "short"
-	          },
-	          "srcWorkerHost": {
-	            "type": "keyword"
-	          },
-	          "srcWorkerPort": {
-	            "type": "integer"
-	          },
-	          "timestamp": {
-	            "type": "date",
-	            "format": "date_optional_time"
-	          },
-	          "value": {
-	            "type": "double"
-	          }
-	      }
-	  }
-}
diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/parsefilters.json b/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/parsefilters.json
deleted file mode 100644
index 5d525830d..000000000
--- a/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/parsefilters.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "org.apache.stormcrawler.parse.ParseFilters": [
-    {
-      "class": "org.apache.stormcrawler.parse.filter.DomainParseFilter",
-      "name": "DomainParseFilter",
-      "params": {
-        "key": "domain",
-        "byHost": false
-       }
-    },
-    {
-      "class": "org.apache.stormcrawler.parse.filter.MimeTypeNormalization",
-      "name": "MimeTypeNormalization"
-    },
-    {
-      "class": "org.apache.stormcrawler.parse.filter.CommaSeparatedToMultivaluedMetadata",
-      "name": "CommaSeparatedToMultivaluedMetadata",
-      "params": {
-        "keys": ["parse.keywords"]
-       }
-    }
-  ]
-}
diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/status.mapping b/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/status.mapping
deleted file mode 100644
index e5b14fe97..000000000
--- a/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/status.mapping
+++ /dev/null
@@ -1,39 +0,0 @@
-{
-	"settings": {
-		"index": {
-			"number_of_shards": 10,
-			"number_of_replicas": 1,
-			"refresh_interval": "5s"
-		}
-	},
-	"mappings": {
-			"dynamic_templates": [{
-				"metadata": {
-					"path_match": "metadata.*",
-					"match_mapping_type": "string",
-					"mapping": {
-						"type": "keyword"
-					}
-				}
-			}],
-			"_source": {
-				"enabled": true
-			},
-			"properties": {
-				"key": {
-					"type": "keyword",
-					"index": true
-				},
-				"nextFetchDate": {
-					"type": "date",
-					"format": "date_optional_time"
-				},
-				"status": {
-					"type": "keyword"
-				},
-				"url": {
-					"type": "keyword"
-				}
-			}
-	}
-}
diff --git a/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/urlfilters.json b/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/urlfilters.json
deleted file mode 100644
index 6098631bb..000000000
--- a/external/opensearch-java/archetype/src/main/resources/archetype-resources/src/main/resources/urlfilters.json
+++ /dev/null
@@ -1,60 +0,0 @@
-{
-	"org.apache.stormcrawler.filtering.URLFilters": [
-		{
-			"class": "org.apache.stormcrawler.filtering.basic.BasicURLFilter",
-			"name": "BasicURLFilter",
-			"params": {
-				"maxPathRepetition": 3,
-				"maxLength": 1024
-			}
-		},
-		{
-			"class": "org.apache.stormcrawler.filtering.depth.MaxDepthFilter",
-			"name": "MaxDepthFilter",
-			"params": {
-				"maxDepth": -1
-			}
-		},
-		{
-			"class": "org.apache.stormcrawler.filtering.basic.BasicURLNormalizer",
-			"name": "BasicURLNormalizer",
-			"params": {
-				"removeAnchorPart": true,
-				"unmangleQueryString": true,
-				"checkValidURI": true,
-				"removeHashes": true,
-				"hostIDNtoASCII": true
-			}
-		},
-		{
-			"class": "org.apache.stormcrawler.filtering.host.HostURLFilter",
-			"name": "HostURLFilter",
-			"params": {
-				"ignoreOutsideHost": false,
-				"ignoreOutsideDomain": true
-			}
-		},
-		{
-			"class": "org.apache.stormcrawler.filtering.regex.RegexURLNormalizer",
-			"name": "RegexURLNormalizer",
-			"params": {
-				"regexNormalizerFile": "default-regex-normalizers.xml"
-			}
-		},
-		{
-			"class": "org.apache.stormcrawler.filtering.regex.RegexURLFilter",
-			"name": "RegexURLFilter",
-			"params": {
-				"regexFilterFile": "default-regex-filters.txt"
-			}
-		},
-		{
-			"class": "org.apache.stormcrawler.filtering.basic.SelfURLFilter",
-			"name": "SelfURLFilter"
-		},
-		{
-			"class": "org.apache.stormcrawler.filtering.sitemap.SitemapFilter",
-			"name": "SitemapFilter"
-		}
-	]
-}
diff --git a/external/opensearch-java/dashboards/importDashboards.sh b/external/opensearch-java/dashboards/importDashboards.sh
deleted file mode 100755
index 561f739c1..000000000
--- a/external/opensearch-java/dashboards/importDashboards.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#!/bin/sh
-
-BIN=$(dirname $0)
-
-echo "Importing status dashboard into OpenSearch Dashboards"
-curl -X POST "localhost:5601/api/saved_objects/_import" -H "osd-xsrf: true" --form file=@$BIN/status.ndjson
-echo ""
-
-echo "Importing metrics dashboard into OpenSearch Dashboards"
-curl -X POST "localhost:5601/api/saved_objects/_import" -H "osd-xsrf: true" --form file=@$BIN/metrics.ndjson
-echo ""
-
-# Storm internal metrics
-# curl -X POST "localhost:5601/api/saved_objects/_import" -H "kbn-xsrf: true" --form file=@$BIN/storm.ndjson
diff --git a/external/opensearch-java/dashboards/metrics.ndjson b/external/opensearch-java/dashboards/metrics.ndjson
deleted file mode 100644
index 20cbb2bc0..000000000
--- a/external/opensearch-java/dashboards/metrics.ndjson
+++ /dev/null
@@ -1,10 +0,0 @@
-{"attributes":{"fields":"[{\"name\":\"_id\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_index\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_score\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_source\",\"type\":\"_source\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_type\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"name\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcComponentId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcTaskId\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerHost\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerPort\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"stormId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"timestamp\",\"type\":\"date\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"value\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true}]","timeFieldName":"timestamp","title":"metrics"},"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","mi
grationVersion":{"index-pattern":"6.5.0"},"references":[],"type":"index-pattern","updated_at":"2020-01-06T11:06:58.178Z","version":"WzMsMV0="}
-{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:activethreads\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : # active threads","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"3\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"2\"}}],\"listeners\":{},\"title\":\"Fetcher : # active threads\"}"},"id":"Fetcher-:-#-active-threads","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.178Z","version":"WzksMV0="}
-{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:num_queues\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : num queues","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{},\"spyPerPage\":10},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"3\"}}],\"listeners\":{},\"title\":\"Fetcher : num queues\"}"},"id":"Fetcher-:-num-queues","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.175Z","version":"WzgsMV0="}
-{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_counter.fetched\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : pages fetched","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{},\"spyPerPage\":10},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"sum\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"3\"}}],\"listeners\":{},\"title\":\"Fetcher : pages fetched\"}"},"id":"Fetcher-:-pages-fetched","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.170Z","version":"WzcsMV0="}
-{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:in_queues\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : URLs waiting in queues","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"addLegend\":false,\"addTimeMarker\":false,\"addTooltip\":true,\"defaultYExtents\":false,\"mode\":\"grouped\",\"scale\":\"linear\",\"setYExtents\":false,\"shareYAxis\":true,\"spyPerPage\":10,\"times\":[],\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"4\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"5\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"4\"}}],\"listeners\":{},\"title\":\"Fetcher : URLs waiting in queues\"}"},"id":"Fetcher-:-URLs-waiting-in-queues","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.160Z","version":"WzUsMV0="}
-{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_average_persec.bytes_fetched_perSec\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : average bytes per second","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"line\",\"params\":{\"shareYAxis\":true,\"addTooltip\":false,\"addLegend\":false,\"showCircles\":true,\"smoothLines\":true,\"interpolate\":\"linear\",\"scale\":\"linear\",\"drawLinesBetweenPoints\":true,\"radiusRatio\":9,\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}}],\"listeners\":{},\"title\":\"Fetcher : average bytes per second\"}"},"id":"Fetcher-:-average-bytes-per-second","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.173Z","version":"WzYsMV0="}
-{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_average_persec.fetched_perSec\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : average pages per second","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"line\",\"params\":{\"shareYAxis\":true,\"addTooltip\":false,\"addLegend\":false,\"showCircles\":true,\"smoothLines\":true,\"interpolate\":\"linear\",\"scale\":\"linear\",\"drawLinesBetweenPoints\":true,\"radiusRatio\":9,\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"3\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}}],\"listeners\":{},\"title\":\"Fetcher : average pages per second\"}"},"id":"Fetcher-:-average-pages-per-second","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.820Z","version":"WzEwLDFd"}
-{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_counter.bytes_fetched\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Total bytes fetched","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"line\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"showCircles\":true,\"smoothLines\":true,\"interpolate\":\"linear\",\"scale\":\"linear\",\"drawLinesBetweenPoints\":true,\"radiusRatio\":9,\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"sum\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"m\",\"min_doc_count\":1,\"extended_bounds\":{}}}],\"listeners\":{},\"title\":\"Total bytes fetched\"}"},"id":"Total-bytes-fetched","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.823Z","version":"WzExLDFd"}
-{"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"filter\":[],\"query\":{\"query\":{\"query_string\":{\"analyze_wildcard\":true,\"query\":\"*\"}},\"language\":\"lucene\"}}"},"optionsJSON":"{\"useMargins\":true,\"hidePanelTitles\":false}","panelsJSON":"[{\"panelIndex\":\"1\",\"gridData\":{\"x\":24,\"y\":20,\"w\":12,\"h\":12,\"i\":\"1\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_0\"},{\"panelIndex\":\"2\",\"gridData\":{\"x\":12,\"y\":20,\"w\":12,\"h\":12,\"i\":\"2\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_1\"},{\"panelIndex\":\"3\",\"gridData\":{\"x\":0,\"y\":0,\"w\":36,\"h\":12,\"i\":\"3\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_2\"},{\"panelIndex\":\"4\",\"gridData\":{\"x\":0,\"y\":20,\"w\":12,\"h\":12,\"i\":\"4\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_3\"},{\"panelIndex\":\"5\",\"gridData\":{\"x\":0,\"y\":40,\"w\":36,\"h\":8,\"i\":\"5\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_4\"},{\"panelIndex\":\"6\",\"gridData\":{\"x\":0,\"y\":32,\"w\":36,\"h\":8,\"i\":\"6\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_5\"},{\"panelIndex\":\"7\",\"gridData\":{\"x\":0,\"y\":12,\"w\":36,\"h\":8,\"i\":\"7\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_6\"}]","timeRestore":false,"title":"Crawl metrics","version":1},"id":"Crawl-metrics","migrationVersion":{"dashboard":"7.3.0"},"references":[{"id":"Fetcher-:-#-active-threads","name":"panel_0","type":"visualization"},{"id":"Fetcher-:-num-queues","name":"panel_1","type":"visualization"},{"id":"Fetcher-:-pages-fetched","name":"panel_2","type":"visualization"},{"id":"Fetcher-:-URLs-waiting-in-queues","name":"panel_3","type":"visualization"},{"id":"Fetcher-:-average-bytes-per-second","name":"panel_4","type":"visualization"},{"id":"Fetcher-:-average-pages-per-second","name":"panel_5","type":"visualization"},{"id":"Total-bytes-fetched","name":"panel_6","type":"visualization"}],"type":"dashboard","updated_at":"2020-01-06T11:06:58.830Z","version":"WzQsMV0="}
-{"exportedCount":9,"missingRefCount":0,"missingReferences":[]}
diff --git a/external/opensearch-java/dashboards/status.ndjson b/external/opensearch-java/dashboards/status.ndjson
deleted file mode 100644
index b3d0122e4..000000000
--- a/external/opensearch-java/dashboards/status.ndjson
+++ /dev/null
@@ -1,5 +0,0 @@
-{"attributes":{"fields":"[{\"name\":\"_id\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_index\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_score\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_source\",\"type\":\"_source\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_type\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"key\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"metadata._redirTo\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.depth\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.error%2Ecause\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.error%2Esource\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.fetch%2Eerror%2Ecount\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.isFeed\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.isSitemap\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"na
me\":\"metadata.url%2Epath\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"nextFetchDate\",\"type\":\"date\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"status\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"url\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":true,\"readFromDocValues\":true}]","title":"status"},"id":"7445c390-7339-11e9-9289-ffa3ee6775e4","migrationVersion":{"index-pattern":"6.5.0"},"references":[],"type":"index-pattern","updated_at":"2020-01-06T11:07:47.130Z","version":"WzEzLDFd"}
-{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"*\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"status count","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"table\",\"params\":{\"perPage\":10,\"showPartialRows\":false,\"showMeticsAtAllLevels\":false},\"aggs\":[{\"id\":\"1\",\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"status\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{},\"title\":\"status count\"}"},"id":"status-count","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"7445c390-7339-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:07:48.278Z","version":"WzE1LDFd"}
-{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"*\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Top Hosts","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"table\",\"params\":{\"perPage\":10,\"showPartialRows\":false,\"showMeticsAtAllLevels\":false},\"aggs\":[{\"id\":\"1\",\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"key\",\"size\":50,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{},\"title\":\"Top Hosts\"}"},"id":"Top-Hosts","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"7445c390-7339-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:07:48.281Z","version":"WzE2LDFd"}
-{"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"filter\":[],\"query\":{\"query\":\"*\",\"language\":\"lucene\"}}"},"optionsJSON":"{\"useMargins\":true,\"hidePanelTitles\":false}","panelsJSON":"[{\"panelRefName\":\"panel_0\",\"version\":\"7.3.0\",\"panelIndex\":\"37874bbf-6607-435a-a231-94d81e9193e7\",\"gridData\":{\"x\":0,\"y\":0,\"w\":16,\"h\":20,\"i\":\"37874bbf-6607-435a-a231-94d81e9193e7\"},\"embeddableConfig\":{}},{\"panelRefName\":\"panel_1\",\"version\":\"7.3.0\",\"panelIndex\":\"4faa5b74-1660-44f7-9227-89d900c8231e\",\"gridData\":{\"x\":16,\"y\":0,\"w\":16,\"h\":20,\"i\":\"4faa5b74-1660-44f7-9227-89d900c8231e\"},\"embeddableConfig\":{}}]","timeRestore":false,"title":"Crawl status","version":1},"id":"Crawl-status","migrationVersion":{"dashboard":"7.3.0"},"references":[{"id":"status-count","name":"panel_0","type":"visualization"},{"id":"Top-Hosts","name":"panel_1","type":"visualization"}],"type":"dashboard","updated_at":"2020-01-06T11:07:47.948Z","version":"WzE0LDFd"}
-{"exportedCount":4,"missingRefCount":0,"missingReferences":[]}
diff --git a/external/opensearch-java/dashboards/storm.ndjson b/external/opensearch-java/dashboards/storm.ndjson
deleted file mode 100644
index 1d25d1f6e..000000000
--- a/external/opensearch-java/dashboards/storm.ndjson
+++ /dev/null
@@ -1,5 +0,0 @@
-{"attributes":{"fields":"[{\"name\":\"_id\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_index\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_score\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_source\",\"type\":\"_source\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_type\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"name\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcComponentId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcTaskId\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerHost\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerPort\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"stormId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"timestamp\",\"type\":\"date\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"value\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true}]","timeFieldName":"timestamp","title":"metrics"},"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","mi
grationVersion":{"index-pattern":"6.5.0"},"references":[],"type":"index-pattern","updated_at":"2020-01-06T11:06:58.178Z","version":"WzMsMV0="}
-{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name: \\\"__receive.population\\\"\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Storm Receive Queues","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"table\",\"params\":{\"perPage\":10,\"showPartialRows\":false,\"showMeticsAtAllLevels\":false},\"aggs\":[{\"id\":\"1\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"srcTaskId\",\"size\":10,\"order\":\"desc\",\"orderBy\":\"1\"}},{\"id\":\"3\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"srcComponentId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{}}"},"id":"Storm-Receive-Queues","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:09:12.875Z","version":"WzIwLDFd"}
-{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"+srcComponentId: \\\"__system\\\" +name: memory\\\\/heap*\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Memory Heap","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":true,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":true,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"name\",\"size\":10,\"order\":\"desc\",\"orderBy\":\"1\"}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"split\",\"params\":{\"field\":\"srcWorkerHost\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\",\"row\":true}}],\"listeners\":{}}"},"id":"Memory-Heap","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:09:12.877Z","version":"WzIxLDFd"}
-{"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"filter\":[],\"query\":{\"query\":\"*\",\"language\":\"lucene\"}}"},"optionsJSON":"{\"useMargins\":true,\"hidePanelTitles\":false}","panelsJSON":"[{\"panelRefName\":\"panel_0\",\"version\":\"7.3.0\",\"panelIndex\":\"19123ee9-8f49-4621-a4dc-716b5ff9fcaf\",\"gridData\":{\"x\":0,\"y\":0,\"w\":32,\"h\":8,\"i\":\"19123ee9-8f49-4621-a4dc-716b5ff9fcaf\"},\"embeddableConfig\":{}},{\"panelRefName\":\"panel_1\",\"version\":\"7.3.0\",\"panelIndex\":\"5fd83542-b7e6-48e0-8679-2ffcacf453a3\",\"gridData\":{\"x\":0,\"y\":8,\"w\":32,\"h\":16,\"i\":\"5fd83542-b7e6-48e0-8679-2ffcacf453a3\"},\"embeddableConfig\":{}}]","timeRestore":false,"title":"Storm metrics","version":1},"id":"Storm-metrics","migrationVersion":{"dashboard":"7.3.0"},"references":[{"id":"Storm-Receive-Queues","name":"panel_0","type":"visualization"},{"id":"Memory-Heap","name":"panel_1","type":"visualization"}],"type":"dashboard","updated_at":"2020-01-06T11:08:33.810Z","version":"WzE5LDFd"}
-{"exportedCount":4,"missingRefCount":0,"missingReferences":[]}
diff --git a/external/opensearch-java/opensearch-conf.yaml b/external/opensearch-java/opensearch-conf.yaml
deleted file mode 100644
index d1d817deb..000000000
--- a/external/opensearch-java/opensearch-conf.yaml
+++ /dev/null
@@ -1,128 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# configuration for OpenSearch resources
-
-config:
-
-  # address to use unless a more specific one has been
-  # defined for a component
-  # also accepts a list or multiple values in a single line
-  # separated by a semi-colon e.g. "opensearch1:9200; opensearch2:9200"
-  opensearch.addresses: "http://localhost:9200"
-  #opensearch.user: "USERNAME"
-  #opensearch.password: "PASSWORD"
-  opensearch.concurrentRequests: 2
-
-  # Sets the response buffer to the specified value in MB.
-  # opensearch.responseBufferSize: 100
-
-  # Disable TLS validation for connection to OpenSearch
-  # opensearch.disable.tls.validation: false
-
-  # Indexer bolt
-  # addresses can be specified as a full URL
-  # if not we assume that the protocol is http and the port 9200
-  opensearch.indexer.addresses: "localhost"
-  opensearch.indexer.index.name: "content"
-  # opensearch.indexer.pipeline: "_PIPELINE_"
-  opensearch.indexer.create: false
-  opensearch.indexer.bulkActions: 100
-  opensearch.indexer.flushInterval: "2s"
-  opensearch.indexer.concurrentRequests: 1
-  opensearch.indexer.sniff: true
-  # Sets the response buffer to the specified value in MB.
-  # opensearch.indexer.responseBufferSize: 100
-
-  # MetricsConsumer
-  # opensearch.metrics.addresses: "http://localhost:9200"
-  opensearch.metrics.index.name: "metrics"
-  opensearch.metrics.sniff: true
-  # Sets the response buffer to the specified value in MB.
-  # opensearch.metrics.responseBufferSize: 100
-
-  # Spout and persistence bolt
-  opensearch.status.addresses: "http://localhost:9200"
-  opensearch.status.index.name: "status"
-  #opensearch.status.user: "USERNAME"
-  #opensearch.status.password: "PASSWORD"
-  # the routing is done on the value of 'partition.url.mode'
-  opensearch.status.routing: true
-  # stores the value used for grouping the URLs as a separate field
-  # needed by the spout implementations
-  # also used for routing if the value above is set to true
-  opensearch.status.routing.fieldname: "key"
-  opensearch.status.bulkActions: 500
-  opensearch.status.flushInterval: "5s"
-  opensearch.status.concurrentRequests: 1
-  opensearch.status.sniff: true
-  # Sets the response buffer to the specified value in MB.
-  # opensearch.status.responseBufferSize: 100
-
-    # spout config #
-
-  # positive or negative filters parsable by the Lucene Query Parser
-  # opensearch.status.filterQuery:
-  #  - "-(key:stormcrawler.net)"
-  #  - "-(key:apache.stormcrawler.org)"
-
-  # time in secs for which the URLs will be considered for fetching after a ack of fail
-  spout.ttl.purgatory: 30
-
-  # Min time (in msecs) to allow between 2 successive queries to OpenSearch
-  spout.min.delay.queries: 2000
-
-  # Max time (in msecs) to allow between 2 successive queries to OpenSearch
-  spout.max.delay.queries: 20000
-
-  # Delay since previous query date (in secs) after which the nextFetchDate value will be reset to the current time
-  # Setting this to -1 or a large value means that OpenSearch will cache the results but also that fewer and fewer
-  # results might be returned.
-  spout.reset.fetchdate.after: 120
-
-  opensearch.status.max.buckets: 50
-  opensearch.status.max.urls.per.bucket: 2
-  # field to group the URLs into buckets
-  opensearch.status.bucket.field: "key"
-  # fields to sort the URLs within a bucket
-  opensearch.status.bucket.sort.field:
-   - "nextFetchDate"
-   - "url"
-  # field to sort the buckets
-  opensearch.status.global.sort.field: "nextFetchDate"
-
-  # AggregationSpout : sampling improves the performance on large crawls
-  opensearch.status.sample: false
-
-  # max allowed duration of a query in sec
-  opensearch.status.query.timeout: -1
-
-  # AggregationSpout (expert): adds this value in mins to the latest date returned in the results and
-  # use it as nextFetchDate
-  opensearch.status.recentDate.increase: -1
-  opensearch.status.recentDate.min.gap: -1
-
-  # Caffeine cache specification for the waitAck cache used in StatusUpdaterBolt.
-  # If not set, the value of topology.message.timeout.secs is used for expireAfterWrite (default: 300s)
-  # opensearch.status.waitack.cache.spec: "maximumSize=10000,expireAfterWrite=300s"
-
-  topology.metrics.consumer.register:
-       - class: "org.apache.stormcrawler.opensearch.metrics.MetricsConsumer"
-         parallelism.hint: 1
-         #whitelist:
-         #  - "fetcher_counter"
-         #  - "fetcher_average.bytes_fetched"
-         #blacklist:
-         #  - "__receive.*"
diff --git a/external/opensearch-java/pom.xml b/external/opensearch-java/pom.xml
index c7dc1e25d..56a73169d 100644
--- a/external/opensearch-java/pom.xml
+++ b/external/opensearch-java/pom.xml
@@ -30,9 +30,8 @@ under the License.
     
 
     
-        2.19.5
-        2.13.0
-        2.12.0
+        3.5.0
+        3.8.0
         true
         0.27
         0.27
@@ -65,7 +64,7 @@ under the License.
                 
                 
                     
-                        ${opensearch.version}
+                        ${opensearch.server.version}
                     
                 
             
@@ -79,15 +78,6 @@ under the License.
             ${opensearch.java.version}
         
 
-        
-        
-            org.opensearch.client
-            opensearch-rest-client-sniffer
-            ${opensearch.restclient.version}
-        
-
         
             org.apache.stormcrawler
             stormcrawler-core
diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/BulkItemResponseToFailedFlag.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/BulkItemResponseToFailedFlag.java
index 0a064f0e9..c32f162f2 100644
--- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/BulkItemResponseToFailedFlag.java
+++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/BulkItemResponseToFailedFlag.java
@@ -23,15 +23,28 @@
 import org.opensearch.client.opensearch._types.ErrorCause;
 import org.opensearch.client.opensearch.core.bulk.BulkResponseItem;
 
-public final class BulkItemResponseToFailedFlag {
-    @NotNull public final BulkResponseItem response;
-    public final boolean failed;
-    @NotNull public final String id;
+/**
+ * Wraps a {@link BulkResponseItem} with a pre-computed failure flag. A 409 (conflict) is not
+ * considered a failure — it simply indicates a document already existed when using create mode.
+ *
+ * @param response the original bulk response item
+ * @param failed whether this item represents a real failure (excludes 409 conflicts)
+ * @param id the document id from the response item
+ */
+public record BulkItemResponseToFailedFlag(
+        @NotNull BulkResponseItem response, boolean failed, @NotNull String id) {
+
+    public BulkItemResponseToFailedFlag {
+        Objects.requireNonNull(response, "response");
+        Objects.requireNonNull(id, "id");
+    }
 
+    /** Constructs with id derived from the response item. */
     public BulkItemResponseToFailedFlag(@NotNull BulkResponseItem response, boolean failed) {
-        this.response = response;
-        this.failed = failed;
-        this.id = Objects.requireNonNull(response.id(), "BulkResponseItem id must not be null");
+        this(
+                response,
+                failed,
+                Objects.requireNonNull(response.id(), "BulkResponseItem id must not be null"));
     }
 
     /** Returns the error cause, or {@code null} if the item did not fail. */
@@ -50,48 +63,9 @@ public String getFailure() {
         return error.reason() != null ? error.reason() : error.type();
     }
 
-    public Integer getStatus() {
+    // opensearch-java: status() returns int HTTP code, not RestStatus enum
+    /** Returns the HTTP status code of this response item. */
+    public int getStatus() {
         return response.status();
     }
-
-    @Override
-    public boolean equals(Object o) {
-        if (this == o) {
-            return true;
-        }
-        if (!(o instanceof BulkItemResponseToFailedFlag)) {
-            return false;
-        }
-
-        BulkItemResponseToFailedFlag that = (BulkItemResponseToFailedFlag) o;
-
-        if (failed != that.failed) {
-            return false;
-        }
-        if (!response.equals(that.response)) {
-            return false;
-        }
-        return id.equals(that.id);
-    }
-
-    @Override
-    public int hashCode() {
-        int result = response.hashCode();
-        result = 31 * result + (failed ? 1 : 0);
-        result = 31 * result + id.hashCode();
-        return result;
-    }
-
-    @Override
-    public String toString() {
-        return "BulkItemResponseToFailedFlag{"
-                + "response="
-                + response
-                + ", failed="
-                + failed
-                + ", id='"
-                + id
-                + '\''
-                + '}';
-    }
 }
diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/DelegateRefresher.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/DelegateRefresher.java
new file mode 100644
index 000000000..fb03a9f0f
--- /dev/null
+++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/DelegateRefresher.java
@@ -0,0 +1,181 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.stormcrawler.opensearch;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Map;
+import java.util.Timer;
+import java.util.TimerTask;
+import org.apache.stormcrawler.JSONResource;
+import org.opensearch.client.json.JsonData;
+import org.opensearch.client.opensearch.OpenSearchClient;
+import org.opensearch.client.opensearch.core.GetResponse;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Loads a delegate class that implements both a required base type and {@link JSONResource}, then
+ * periodically refreshes its configuration from OpenSearch. Used by {@link
+ * org.apache.stormcrawler.opensearch.filtering.JSONURLFilterWrapper} and {@link
+ * org.apache.stormcrawler.opensearch.parse.filter.JSONResourceWrapper} to eliminate duplicated
+ * setup/refresh/cleanup logic.
+ *
+ * 

This is the opensearch-java (OpenSearch Java Client 3.x / HC5) counterpart of the class with + * the same name in the {@code external/opensearch} module. It uses the typed {@link + * OpenSearchClient} instead of the deprecated {@code RestHighLevelClient}. + * + * @param the base type that the delegate must extend (e.g. URLFilter or ParseFilter) + */ +public class DelegateRefresher { + + private static final Logger LOG = LoggerFactory.getLogger(DelegateRefresher.class); + + private final T delegate; + private Timer refreshTimer; + private OpenSearchClient osClient; + + /** + * Creates a refresher by loading the delegate class from the JSON configuration. + * + * @param baseType the required base class (e.g. URLFilter.class or ParseFilter.class) + * @param stormConf the Storm configuration map + * @param filterParams the JSON params node containing "delegate" and optional "refresh" + * @param configurer callback to configure the delegate after instantiation + */ + public DelegateRefresher( + Class baseType, + Map stormConf, + JsonNode filterParams, + DelegateConfigure configurer) { + + JsonNode delegateNode = filterParams.get("delegate"); + if (delegateNode == null) { + throw new RuntimeException("delegateNode undefined!"); + } + + String delegateClassName = null; + JsonNode node = delegateNode.get("class"); + if (node != null && node.isTextual()) { + delegateClassName = node.asText(); + } + if (delegateClassName == null) { + throw new RuntimeException(baseType.getSimpleName() + " delegate class undefined!"); + } + + try { + Class filterClass = Class.forName(delegateClassName); + + if (!baseType.isAssignableFrom(filterClass)) { + throw new RuntimeException( + "Filter " + delegateClassName + " does not extend " + baseType.getName()); + } + + @SuppressWarnings("unchecked") + T instance = (T) filterClass.getDeclaredConstructor().newInstance(); + + if (!(instance instanceof JSONResource)) { + throw new RuntimeException( + "Filter " + delegateClassName + " does not 
implement JSONResource"); + } + + this.delegate = instance; + } catch (RuntimeException e) { + throw e; + } catch (Exception e) { + LOG.error("Can't setup {}: {}", delegateClassName, e); + throw new RuntimeException("Can't setup " + delegateClassName, e); + } + + // configure the delegate + JsonNode paramsNode = delegateNode.get("params"); + configurer.configure(delegate, stormConf, paramsNode); + + // set up periodic refresh from OpenSearch + int refreshRate = 600; + node = filterParams.get("refresh"); + if (node != null && (node.isInt() || node.isTextual())) { + refreshRate = node.asInt(refreshRate); + } + + final JSONResource resource = (JSONResource) delegate; + + refreshTimer = new Timer(); + refreshTimer.schedule( + new TimerTask() { + public void run() { + if (osClient == null) { + try { + osClient = OpenSearchConnection.getClient(stormConf, "config"); + } catch (Exception e) { + LOG.error("Exception while creating OpenSearch connection", e); + } + } + if (osClient != null) { + LOG.info("Reloading json resources from OpenSearch"); + try { + GetResponse response = + osClient.get( + g -> + g.index("config") + .id(resource.getResourceFile()), + JsonData.class); + if (response.found() && response.source() != null) { + String json = response.source().toJson().toString(); + resource.loadJSONResources( + new ByteArrayInputStream( + json.getBytes(StandardCharsets.UTF_8))); + } + } catch (Exception e) { + LOG.error("Can't load config from OpenSearch", e); + } + } + } + }, + refreshRate * 1000L, + refreshRate * 1000L); + } + + /** Returns the delegate instance. */ + public T getDelegate() { + return delegate; + } + + /** Cancels the refresh timer and closes the OpenSearch client. 
*/ + public void cleanup() { + if (refreshTimer != null) { + refreshTimer.cancel(); + } + if (osClient != null) { + try { + osClient._transport().close(); + } catch (IOException e) { + LOG.error("Exception when closing OpenSearch client", e); + } + osClient = null; + } + } + + /** Callback interface for configuring the delegate after instantiation. */ + @FunctionalInterface + public interface DelegateConfigure { + void configure(T delegate, Map stormConf, JsonNode params); + } +} diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/IndexCreation.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/IndexCreation.java index ed44644c1..f76172057 100644 --- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/IndexCreation.java +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/IndexCreation.java @@ -21,12 +21,10 @@ import java.io.IOException; import java.net.URL; import java.nio.charset.StandardCharsets; -import org.opensearch.client.Request; -import org.opensearch.client.Response; -import org.opensearch.client.RestClient; import org.opensearch.client.opensearch.OpenSearchClient; +import org.opensearch.client.opensearch.generic.Requests; +import org.opensearch.client.opensearch.generic.Response; import org.opensearch.client.opensearch.indices.ExistsTemplateRequest; -import org.opensearch.client.transport.rest_client.RestClientTransport; import org.slf4j.Logger; public class IndexCreation { @@ -72,14 +70,17 @@ private static boolean createTemplate( final String jsonIndexConfiguration = Resources.toString(mapping, StandardCharsets.UTF_8); - // Extract the low-level REST client to bypass typed builder limitations for raw JSON - RestClient restClient = ((RestClientTransport) client._transport()).restClient(); - Request request = new Request("PUT", "/_template/" + templateName); - request.setJsonEntity(jsonIndexConfiguration); - - Response response = 
restClient.performRequest(request); - int statusCode = response.getStatusLine().getStatusCode(); - return statusCode == 200 || statusCode == 201; + try (Response response = + client.generic() + .execute( + Requests.builder() + .endpoint("/_template/" + templateName) + .method("PUT") + .json(jsonIndexConfiguration) + .build())) { + int statusCode = response.getStatus(); + return statusCode == 200 || statusCode == 201; + } } catch (Exception e) { log.warn("template '{}' not created", templateName, e); return false; @@ -96,14 +97,17 @@ private static boolean createIndex( final String jsonIndexConfiguration = Resources.toString(mapping, StandardCharsets.UTF_8); - // Extract the low-level REST client to bypass typed builder limitations for raw JSON - RestClient restClient = ((RestClientTransport) client._transport()).restClient(); - Request request = new Request("PUT", "/" + indexName); - request.setJsonEntity(jsonIndexConfiguration); - - Response response = restClient.performRequest(request); - int statusCode = response.getStatusLine().getStatusCode(); - return statusCode == 200 || statusCode == 201; + try (Response response = + client.generic() + .execute( + Requests.builder() + .endpoint("/" + indexName) + .method("PUT") + .json(jsonIndexConfiguration) + .build())) { + int statusCode = response.getStatus(); + return statusCode == 200 || statusCode == 201; + } } catch (Exception e) { log.warn("index '{}' not created", indexName, e); return false; diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/OpenSearchConnection.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/OpenSearchConnection.java index deb96c841..0d8675398 100644 --- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/OpenSearchConnection.java +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/OpenSearchConnection.java @@ -17,8 +17,8 @@ package org.apache.stormcrawler.opensearch; -import 
static org.opensearch.client.RestClientBuilder.DEFAULT_CONNECT_TIMEOUT_MILLIS; -import static org.opensearch.client.RestClientBuilder.DEFAULT_SOCKET_TIMEOUT_MILLIS; +import static org.opensearch.client.transport.httpclient5.ApacheHttpClient5TransportBuilder.DEFAULT_CONNECT_TIMEOUT_MILLIS; +import static org.opensearch.client.transport.httpclient5.ApacheHttpClient5TransportBuilder.DEFAULT_RESPONSE_TIMEOUT_MILLIS; import java.io.IOException; import java.net.URI; @@ -28,29 +28,26 @@ import java.util.Map; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; +import javax.net.ssl.SSLContext; import org.apache.commons.lang3.StringUtils; -import org.apache.http.HttpHost; -import org.apache.http.auth.AuthScope; -import org.apache.http.auth.UsernamePasswordCredentials; -import org.apache.http.client.CredentialsProvider; -import org.apache.http.conn.ssl.NoopHostnameVerifier; -import org.apache.http.conn.ssl.TrustAllStrategy; -import org.apache.http.impl.client.BasicCredentialsProvider; -import org.apache.http.ssl.SSLContextBuilder; +import org.apache.hc.client5.http.auth.AuthScope; +import org.apache.hc.client5.http.auth.UsernamePasswordCredentials; +import org.apache.hc.client5.http.config.ConnectionConfig; +import org.apache.hc.client5.http.impl.auth.BasicCredentialsProvider; +import org.apache.hc.client5.http.impl.nio.PoolingAsyncClientConnectionManagerBuilder; +import org.apache.hc.client5.http.ssl.ClientTlsStrategyBuilder; +import org.apache.hc.client5.http.ssl.NoopHostnameVerifier; +import org.apache.hc.core5.http.HttpHost; +import org.apache.hc.core5.reactor.ssl.TlsDetails; +import org.apache.hc.core5.ssl.SSLContextBuilder; +import org.apache.hc.core5.util.Timeout; import org.apache.stormcrawler.util.ConfUtils; import org.jetbrains.annotations.NotNull; -import org.jetbrains.annotations.Nullable; -import org.opensearch.client.HttpAsyncResponseConsumerFactory; -import org.opensearch.client.Node; -import 
org.opensearch.client.RequestOptions; -import org.opensearch.client.RestClient; -import org.opensearch.client.RestClientBuilder; import org.opensearch.client.json.jackson.JacksonJsonpMapper; import org.opensearch.client.opensearch.OpenSearchClient; import org.opensearch.client.opensearch.core.bulk.BulkOperation; -import org.opensearch.client.sniff.Sniffer; -import org.opensearch.client.transport.rest_client.RestClientOptions; -import org.opensearch.client.transport.rest_client.RestClientTransport; +import org.opensearch.client.transport.OpenSearchTransport; +import org.opensearch.client.transport.httpclient5.ApacheHttpClient5TransportBuilder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -65,19 +62,15 @@ public final class OpenSearchConnection { @NotNull private final AsyncBulkProcessor processor; - @Nullable private final Sniffer sniffer; - - @NotNull private final RestClient restClient; + @NotNull private final OpenSearchTransport transport; private OpenSearchConnection( @NotNull OpenSearchClient c, @NotNull AsyncBulkProcessor p, - @Nullable Sniffer s, - @NotNull RestClient rc) { + @NotNull OpenSearchTransport t) { client = c; processor = p; - sniffer = s; - restClient = rc; + transport = t; } public OpenSearchClient getClient() { @@ -90,7 +83,7 @@ public OpenSearchClient getClient() { * client's transport via {@code client._transport().close()}. */ public static OpenSearchClient getClient(Map stormConf, String boltType) { - return buildClientResources(stormConf, boltType, 100).client(); + return buildClientResources(stormConf, boltType).client(); } /** Adds a single bulk operation to the internal processor. 
*/ @@ -130,11 +123,7 @@ public static OpenSearchConnection getConnection( final String dottedType = boltType + "."; - final int bufferSize = - ConfUtils.getInt( - stormConf, Constants.PARAMPREFIX, dottedType, "responseBufferSize", 100); - - ClientResources cr = buildClientResources(stormConf, boltType, bufferSize); + ClientResources cr = buildClientResources(stormConf, boltType); final String flushIntervalString = ConfUtils.getString( @@ -150,7 +139,6 @@ public static OpenSearchConnection getConnection( stormConf, Constants.PARAMPREFIX, dottedType, "concurrentRequests", 1); AsyncBulkProcessor bulkProcessor = null; - Sniffer sniffer = null; try { bulkProcessor = new AsyncBulkProcessor.Builder(cr.client(), listener) @@ -159,14 +147,7 @@ public static OpenSearchConnection getConnection( .setConcurrentRequests(concurrentRequests) .build(); - boolean sniff = - ConfUtils.getBoolean( - stormConf, Constants.PARAMPREFIX, dottedType, "sniff", true); - if (sniff) { - sniffer = Sniffer.builder(cr.restClient()).build(); - } - - return new OpenSearchConnection(cr.client(), bulkProcessor, sniffer, cr.restClient()); + return new OpenSearchConnection(cr.client(), bulkProcessor, cr.transport()); } catch (Exception e) { if (bulkProcessor != null) { try { @@ -176,7 +157,7 @@ public static OpenSearchConnection getConnection( } } try { - cr.restClient().close(); + cr.transport().close(); } catch (IOException suppressed) { e.addSuppressed(suppressed); } @@ -206,15 +187,11 @@ public void close() { throw new RuntimeException(e); } - if (sniffer != null) { - sniffer.close(); - } - - // Now close the REST client (also closes the transport) + // Now close the transport (also shuts down the underlying HTTP client) try { - restClient.close(); + transport.close(); } catch (IOException e) { - LOG.trace("Client threw IO exception."); + LOG.trace("Transport threw IO exception on close."); } } @@ -239,10 +216,10 @@ public static String getBulkOperationId(BulkOperation op) { } // internal helpers - 
private record ClientResources(OpenSearchClient client, RestClient restClient) {} + private record ClientResources(OpenSearchClient client, OpenSearchTransport transport) {} private static ClientResources buildClientResources( - Map stormConf, String boltType, int responseBufferSizeMB) { + Map stormConf, String boltType) { final String dottedType = boltType + "."; @@ -278,10 +255,15 @@ private static ClientResources buildClientResources( if (uri.getScheme() != null) { scheme = uri.getScheme(); } - hosts.add(new HttpHost(uri.getHost(), port, scheme)); + // HC5: constructor is (scheme, hostname, port) — not (hostname, port, scheme) + hosts.add(new HttpHost(scheme, uri.getHost(), port)); } - final RestClientBuilder builder = RestClient.builder(hosts.toArray(new HttpHost[0])); + LOG.info( + "OpenSearch {} transport configured with {} host(s): {}", + boltType, + hosts.size(), + hosts); // authentication via user / password final String user = @@ -306,28 +288,90 @@ private static ClientResources buildClientResources( final boolean needsUser = StringUtils.isNotBlank(user) && StringUtils.isNotBlank(password); final boolean needsProxy = StringUtils.isNotBlank(proxyhost) && proxyport != -1; + // Defaults from ApacheHttpClient5TransportBuilder (same as the former RestClientBuilder) + final int connectTimeout = + ConfUtils.getInt( + stormConf, + Constants.PARAMPREFIX, + dottedType, + "connect.timeout", + DEFAULT_CONNECT_TIMEOUT_MILLIS); + final int socketTimeout = + ConfUtils.getInt( + stormConf, + Constants.PARAMPREFIX, + dottedType, + "socket.timeout", + DEFAULT_RESPONSE_TIMEOUT_MILLIS); + + final boolean compression = + ConfUtils.getBoolean( + stormConf, Constants.PARAMPREFIX, dottedType, "compression", false); + + final ApacheHttpClient5TransportBuilder builder = + ApacheHttpClient5TransportBuilder.builder(hosts.toArray(new HttpHost[0])) + .setMapper(new JacksonJsonpMapper()); + + // Timeouts via ConnectionConfig on the builder's internal connection manager + 
builder.setConnectionConfigCallback( + connConfigBuilder -> + connConfigBuilder + .setConnectTimeout(Timeout.ofMilliseconds(connectTimeout)) + .setSocketTimeout(Timeout.ofMilliseconds(socketTimeout))); + + // Auth, proxy, and/or trust-all SSL via HttpClient customisation if (needsUser || needsProxy || disableTlsValidation) { builder.setHttpClientConfigCallback( httpClientBuilder -> { + // hc.client5 auth: password is char[], AuthScope(host, port) if (needsUser) { - final CredentialsProvider credentialsProvider = + final BasicCredentialsProvider credentialsProvider = new BasicCredentialsProvider(); credentialsProvider.setCredentials( - AuthScope.ANY, new UsernamePasswordCredentials(user, password)); + new AuthScope(null, -1), + new UsernamePasswordCredentials(user, password.toCharArray())); httpClientBuilder.setDefaultCredentialsProvider(credentialsProvider); } + // hc.client5 proxy: HttpHost(scheme, host, port) if (needsProxy) { httpClientBuilder.setProxy( - new HttpHost(proxyhost, proxyport, proxyscheme)); + new HttpHost(proxyscheme, proxyhost, proxyport)); } - + // Custom connection manager overrides the builder's internal one, + // so timeouts and TlsDetailsFactory must be replicated here if (disableTlsValidation) { try { - final SSLContextBuilder sslContext = new SSLContextBuilder(); - sslContext.loadTrustMaterial(null, new TrustAllStrategy()); - httpClientBuilder.setSSLContext(sslContext.build()); - httpClientBuilder.setSSLHostnameVerifier( - NoopHostnameVerifier.INSTANCE); + final SSLContext sslContext = + SSLContextBuilder.create() + .loadTrustMaterial((chain, authType) -> true) + .build(); + httpClientBuilder.setConnectionManager( + PoolingAsyncClientConnectionManagerBuilder.create() + .setTlsStrategy( + ClientTlsStrategyBuilder.create() + .setSslContext(sslContext) + .setHostnameVerifier( + NoopHostnameVerifier + .INSTANCE) + // HTTP/2 ALPN negotiation + .setTlsDetailsFactory( + sslEngine -> + new TlsDetails( + sslEngine + .getSession(), + sslEngine + 
.getApplicationProtocol())) + .build()) + .setDefaultConnectionConfig( + ConnectionConfig.custom() + .setConnectTimeout( + Timeout.ofMilliseconds( + connectTimeout)) + .setSocketTimeout( + Timeout.ofMilliseconds( + socketTimeout)) + .build()) + .build()); } catch (Exception e) { throw new RuntimeException("Failed to disable TLS validation", e); } @@ -336,101 +380,13 @@ private static ClientResources buildClientResources( }); } - final int connectTimeout = - ConfUtils.getInt( - stormConf, - Constants.PARAMPREFIX, - dottedType, - "connect.timeout", - DEFAULT_CONNECT_TIMEOUT_MILLIS); - final int socketTimeout = - ConfUtils.getInt( - stormConf, - Constants.PARAMPREFIX, - dottedType, - "socket.timeout", - DEFAULT_SOCKET_TIMEOUT_MILLIS); - // timeout until connection is established - builder.setRequestConfigCallback( - requestConfigBuilder -> - requestConfigBuilder - .setConnectTimeout(connectTimeout) - // Timeout when waiting for data - .setSocketTimeout(socketTimeout)); - - // TODO check if this has gone somewhere else - // int maxRetryTimeout = ConfUtils.getInt(stormConf, Constants.PARAMPREFIX + - // boltType + - // ".max.retry.timeout", - // DEFAULT_MAX_RETRY_TIMEOUT_MILLIS); - // builder.setMaxRetryTimeoutMillis(maxRetryTimeout); - - // TODO configure headers etc... 
- // Map configSettings = (Map) stormConf - // .get(Constants.PARAMPREFIX + boltType + ".settings"); - // if (configSettings != null) { - // configSettings.forEach((k, v) -> settings.put(k, v)); - // } - - // use node selector only to log nodes listed in the config - // and/or discovered through sniffing - builder.setNodeSelector( - nodes -> { - for (Node node : nodes) { - LOG.debug( - "Connected to OpenSearch node {} [{}] for {}", - node.getName(), - node.getHost(), - boltType); - } - }); - - final boolean compression = - ConfUtils.getBoolean( - stormConf, Constants.PARAMPREFIX, dottedType, "compression", false); - + // Compression: first-class builder method, not a request interceptor builder.setCompressionEnabled(compression); - final RestClient restClient = builder.build(); - - // --- Response buffer size configuration --- - // The default HeapBufferedResponseConsumerFactory in the low-level REST client has - // a hardcoded limit of 100 MB. Large MSearch or aggregation responses can exceed - // this, causing ContentTooLongException. - // - // This fix works because we use RestClientTransport, which passes RequestOptions - // (including HttpAsyncResponseConsumerFactory) directly to the low-level RestClient. - // - // NOTE: if StormCrawler ever switches to ApacheHttpClient5Transport, this approach - // will silently stop working. 
In that case, use: - // ApacheHttpClient5Options.DEFAULT.toBuilder() - // .setHttpAsyncResponseConsumerFactory(factory).build() - // See: https://github.com/opensearch-project/opensearch-java/issues/1370 - final int DEFAULT_RESPONSE_BUFFER_SIZE_MB = 100; - final int effectiveBufferSizeMB; - if (responseBufferSizeMB <= 0) { - LOG.warn( - "Invalid responseBufferSize {}MB for {}, falling back to default {}MB", - responseBufferSizeMB, - boltType, - DEFAULT_RESPONSE_BUFFER_SIZE_MB); - effectiveBufferSizeMB = DEFAULT_RESPONSE_BUFFER_SIZE_MB; - } else { - effectiveBufferSizeMB = responseBufferSizeMB; - } - LOG.info("OpenSearch response buffer size for {}: {}MB", boltType, effectiveBufferSizeMB); - - final RequestOptions.Builder optionsBuilder = RequestOptions.DEFAULT.toBuilder(); - optionsBuilder.setHttpAsyncResponseConsumerFactory( - new HttpAsyncResponseConsumerFactory.HeapBufferedResponseConsumerFactory( - effectiveBufferSizeMB * 1024 * 1024)); - final RestClientOptions transportOptions = new RestClientOptions(optionsBuilder.build()); - - final RestClientTransport transport = - new RestClientTransport(restClient, new JacksonJsonpMapper(), transportOptions); + final OpenSearchTransport transport = builder.build(); final OpenSearchClient openSearchClient = new OpenSearchClient(transport); - return new ClientResources(openSearchClient, restClient); + return new ClientResources(openSearchClient, transport); } /** diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/WaitAckCache.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/WaitAckCache.java new file mode 100644 index 000000000..0e8574fbb --- /dev/null +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/WaitAckCache.java @@ -0,0 +1,340 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.stormcrawler.opensearch; + +import com.github.benmanes.caffeine.cache.Cache; +import com.github.benmanes.caffeine.cache.Caffeine; +import com.github.benmanes.caffeine.cache.RemovalCause; +import com.github.benmanes.caffeine.cache.Ticker; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.ReentrantLock; +import java.util.function.Consumer; +import java.util.stream.Collectors; +import org.apache.storm.tuple.Tuple; +import org.apache.stormcrawler.metrics.ScopedCounter; +import org.jetbrains.annotations.Nullable; +// opensearch-java: uses typed BulkRequest/BulkResponse, not legacy REST equivalents +import org.opensearch.client.opensearch.core.BulkRequest; +import org.opensearch.client.opensearch.core.BulkResponse; +import org.slf4j.Logger; + +/** + * Thread-safe cache that tracks in-flight tuples awaiting bulk acknowledgment from OpenSearch. + * Provides shared logic for processing bulk responses and failing tuples on error, used by + * IndexerBolt, DeletionBolt, and StatusUpdaterBolt. 
+ */ +public class WaitAckCache { + + /** Callback invoked for each tuple when processing a successful bulk response. */ + @FunctionalInterface + public interface TupleAction { + void handle(String id, Tuple tuple, BulkItemResponseToFailedFlag selected); + } + + private final Cache> cache; + private final ReentrantLock lock = new ReentrantLock(true); + private final Logger log; + private final Consumer onEviction; + + /** Creates a cache with a fixed 60-second expiry. */ + public WaitAckCache(Logger log, Consumer onEviction) { + this(Caffeine.newBuilder().expireAfterWrite(60, TimeUnit.SECONDS), log, onEviction); + } + + /** + * Creates a cache using a {@link Caffeine} spec string (e.g. {@code "expireAfterWrite=300s"}), + * typically driven by {@code topology.message.timeout.secs}. + */ + public WaitAckCache(String cacheSpec, Logger log, Consumer onEviction) { + this(Caffeine.from(cacheSpec), log, onEviction); + } + + /** Creates a cache with a custom ticker for deterministic time control in tests. */ + WaitAckCache(String cacheSpec, Logger log, Consumer onEviction, Ticker ticker) { + this(Caffeine.from(cacheSpec).ticker(ticker).executor(Runnable::run), log, onEviction); + } + + private WaitAckCache(Caffeine builder, Logger log, Consumer onEviction) { + this.log = log; + this.onEviction = onEviction; + this.cache = + builder.>removalListener( + (String key, List value, RemovalCause cause) -> { + if (!cause.wasEvicted()) { + return; + } + if (value != null) { + log.error( + "Purged from waitAck {} with {} values", + key, + value.size()); + for (Tuple t : value) { + onEviction.accept(t); + } + } else { + log.error("Purged from waitAck {} with no values", key); + } + }) + .build(); + } + + /** Returns the approximate number of entries in this cache. */ + public long estimatedSize() { + return cache.estimatedSize(); + } + + /** Adds a tuple to the cache under the given document ID, creating the list if needed. 
*/ + public void addTuple(String docID, Tuple tuple) { + lock.lock(); + try { + List tt = cache.get(docID, k -> new LinkedList<>()); + tt.add(tuple); + if (log.isDebugEnabled()) { + String url = (String) tuple.getValueByField("url"); + log.debug("Added to waitAck {} with ID {} total {}", url, docID, tt.size()); + } + } finally { + lock.unlock(); + } + } + + /** Returns true if the cache contains an entry for the given document ID. */ + public boolean contains(String docID) { + lock.lock(); + try { + return cache.getIfPresent(docID) != null; + } finally { + lock.unlock(); + } + } + + /** Forces pending cache maintenance, triggering eviction listeners for expired entries. */ + public void cleanUp() { + cache.cleanUp(); + } + + /** Fails all remaining tuples in the cache and invalidates all entries. */ + public void shutdown() { + lock.lock(); + try { + Map> remaining = cache.asMap(); + for (var entry : remaining.entrySet()) { + log.warn( + "Shutdown: failing {} tuple(s) for ID {}", + entry.getValue().size(), + entry.getKey()); + for (Tuple t : entry.getValue()) { + onEviction.accept(t); + } + } + cache.invalidateAll(); + } finally { + lock.unlock(); + } + } + + /** Invalidates a single cache entry. */ + public void invalidate(String docID) { + lock.lock(); + try { + cache.invalidate(docID); + } finally { + lock.unlock(); + } + } + + /** + * Processes a successful bulk response: classifies each item (conflict vs failure), retrieves + * cached tuples, selects the best response per document ID, and invokes the action for each + * tuple. 
+ * + * @param conflictCounter optional metric counter; if non-null, increments "doc_conflicts" scope + * for each conflict + */ + public void processBulkResponse( + BulkResponse response, + long executionId, + @Nullable ScopedCounter conflictCounter, + TupleAction action) { + + // opensearch-java: items() returns List; status() returns int + var idsToBulkItems = + response.items().stream() + .map( + bir -> { + var error = bir.error(); + boolean failed = false; + if (error != null) { + // opensearch-java: int status code, not RestStatus enum + if (bir.status() == 409) { + if (conflictCounter != null) { + conflictCounter.scope("doc_conflicts").incrBy(1); + } + log.debug("Doc conflict ID {}", bir.id()); + } else { + log.error( + "Bulk item failure ID {}: {}", + bir.id(), + error.reason() != null + ? error.reason() + : error.type()); + failed = true; + } + } + return new BulkItemResponseToFailedFlag(bir, failed); + }) + .collect( + // https://github.com/apache/stormcrawler/issues/832 + Collectors.groupingBy( + BulkItemResponseToFailedFlag::id, + Collectors.toUnmodifiableList())); + + Map> presentTuples; + long estimatedSize; + Set debugInfo = null; + lock.lock(); + try { + presentTuples = cache.getAllPresent(idsToBulkItems.keySet()); + if (!presentTuples.isEmpty()) { + cache.invalidateAll(presentTuples.keySet()); + } + estimatedSize = cache.estimatedSize(); + if (log.isDebugEnabled() && estimatedSize > 0L) { + debugInfo = new HashSet<>(cache.asMap().keySet()); + } + } finally { + lock.unlock(); + } + + int ackCount = 0; + int failureCount = 0; + + for (var entry : presentTuples.entrySet()) { + final var id = entry.getKey(); + final var tuples = entry.getValue(); + final var bulkItems = idsToBulkItems.get(id); + + BulkItemResponseToFailedFlag selected = selectBest(bulkItems, id); + + if (tuples != null) { + log.debug("Found {} tuple(s) for ID {}", tuples.size(), id); + for (Tuple t : tuples) { + if (selected.failed()) { + failureCount++; + } else { + ackCount++; + } + 
action.handle(id, t, selected); + } + } else { + log.warn("Could not find unacked tuples for {}", id); + } + } + + log.info( + "Bulk response [{}] : items {}, waitAck {}, acked {}, failed {}", + executionId, + idsToBulkItems.size(), + estimatedSize, + ackCount, + failureCount); + + if (debugInfo != null) { + for (String k : debugInfo) { + log.debug("Still in wait ack after bulk response [{}] => {}", executionId, k); + } + } + } + + /** + * Processes a failed bulk request by failing all associated tuples. + * + * @param failAction callback applied to each tuple that must be failed + */ + public void processFailedBulk( + BulkRequest request, long executionId, Throwable failure, Consumer failAction) { + + log.error("Exception with bulk {} - failing the whole lot ", executionId, failure); + + // opensearch-java: operations() + getBulkOperationId replaces + // legacy requests() + DocWriteRequest::id + final var failedIds = + request.operations().stream() + .map(OpenSearchConnection::getBulkOperationId) + .filter(Objects::nonNull) + .collect(Collectors.toUnmodifiableSet()); + + Map> failedTupleLists; + lock.lock(); + try { + failedTupleLists = cache.getAllPresent(failedIds); + if (!failedTupleLists.isEmpty()) { + cache.invalidateAll(failedTupleLists.keySet()); + } + } finally { + lock.unlock(); + } + + for (var id : failedIds) { + var tuples = failedTupleLists.get(id); + if (tuples != null) { + log.debug("Failed {} tuple(s) for ID {}", tuples.size(), id); + for (Tuple t : tuples) { + failAction.accept(t); + } + } else { + log.warn("Could not find unacked tuple for {}", id); + } + } + } + + /** + * Selects the best response when there are multiple bulk items for the same document ID. + * Prefers non-failed responses; warns when there is a mix of success and failure. If all items + * are failed, returns the first one (no warning logged since there is no ambiguity). 
+ */ + private BulkItemResponseToFailedFlag selectBest( + List items, String id) { + if (items.size() == 1) { + return items.get(0); + } + + BulkItemResponseToFailedFlag best = items.get(0); + int failedCount = 0; + for (var item : items) { + if (item.failed()) { + failedCount++; + } else { + best = item; + } + } + if (failedCount > 0 && failedCount < items.size()) { + log.warn( + "The id {} would result in an ack and a failure." + + " Using only the ack for processing.", + id); + } + return best; + } +} diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/bolt/DeletionBolt.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/bolt/DeletionBolt.java index 779c23c89..0c10dd3c2 100644 --- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/bolt/DeletionBolt.java +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/bolt/DeletionBolt.java @@ -17,30 +17,19 @@ package org.apache.stormcrawler.opensearch.bolt; -import com.github.benmanes.caffeine.cache.Cache; -import com.github.benmanes.caffeine.cache.Caffeine; -import com.github.benmanes.caffeine.cache.RemovalCause; -import com.github.benmanes.caffeine.cache.RemovalListener; import java.lang.invoke.MethodHandles; -import java.util.LinkedList; -import java.util.List; import java.util.Map; -import java.util.Objects; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.locks.ReentrantLock; -import java.util.stream.Collectors; import org.apache.storm.task.OutputCollector; import org.apache.storm.task.TopologyContext; import org.apache.storm.topology.OutputFieldsDeclarer; import org.apache.storm.topology.base.BaseRichBolt; import org.apache.storm.tuple.Tuple; import org.apache.stormcrawler.Metadata; +import org.apache.stormcrawler.metrics.CrawlerMetrics; import org.apache.stormcrawler.opensearch.AsyncBulkProcessor; -import org.apache.stormcrawler.opensearch.BulkItemResponseToFailedFlag; import 
org.apache.stormcrawler.opensearch.OpenSearchConnection; +import org.apache.stormcrawler.opensearch.WaitAckCache; import org.apache.stormcrawler.util.ConfUtils; -import org.jetbrains.annotations.NotNull; -import org.jetbrains.annotations.Nullable; import org.opensearch.client.opensearch.core.BulkRequest; import org.opensearch.client.opensearch.core.BulkResponse; import org.opensearch.client.opensearch.core.bulk.BulkOperation; @@ -52,8 +41,7 @@ * will also try to delete documents even though they were never indexed and it currently won't * delete documents which were indexed under the canonical URL. */ -public class DeletionBolt extends BaseRichBolt - implements RemovalListener>, AsyncBulkProcessor.Listener { +public class DeletionBolt extends BaseRichBolt implements AsyncBulkProcessor.Listener { static final org.slf4j.Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); @@ -66,10 +54,7 @@ public class DeletionBolt extends BaseRichBolt private OpenSearchConnection connection; - private Cache> waitAck; - - // Be fair due to cache timeout - private final ReentrantLock waitAckLock = new ReentrantLock(true); + private WaitAckCache waitAck; public DeletionBolt() {} @@ -89,38 +74,17 @@ public void prepare( try { connection = OpenSearchConnection.getConnection(conf, BOLT_TYPE, this); } catch (Exception e1) { - LOG.error("Can't connect to opensearch", e1); + LOG.error("Can't connect to OpenSearch", e1); throw new RuntimeException(e1); } - waitAck = - Caffeine.newBuilder() - .expireAfterWrite(60, TimeUnit.SECONDS) - .removalListener(this) - .build(); - - context.registerMetric("waitAck", () -> waitAck.estimatedSize(), 10); - } - - @Override - public void onRemoval( - @Nullable String key, @Nullable List value, @NotNull RemovalCause cause) { - if (!cause.wasEvicted()) { - return; - } - if (value != null) { - LOG.error("Purged from waitAck {} with {} values", key, value.size()); - for (Tuple t : value) { - _collector.fail(t); - } - } else { - // This 
should never happen, but log it anyway. - LOG.error("Purged from waitAck {} with no values", key); - } + waitAck = new WaitAckCache(LOG, _collector::fail); + CrawlerMetrics.registerGauge(context, conf, "waitAck", waitAck::estimatedSize, 10); } @Override public void cleanup() { + waitAck.shutdown(); if (connection != null) { connection.close(); } @@ -138,18 +102,7 @@ public void execute(Tuple tuple) { final String targetIndex = getIndexName(metadata); BulkOperation op = BulkOperation.of(b -> b.delete(d -> d.index(targetIndex).id(docID))); - waitAckLock.lock(); - try { - List tt = waitAck.getIfPresent(docID); - if (tt == null) { - tt = new LinkedList<>(); - waitAck.put(docID, tt); - } - tt.add(tuple); - LOG.debug("Added to waitAck {} with ID {} total {}", url, docID, tt.size()); - } finally { - waitAckLock.unlock(); - } + waitAck.addTuple(docID, tuple); connection.addToProcessor(op); } @@ -183,134 +136,27 @@ public void beforeBulk(long executionId, BulkRequest request) {} @Override public void afterBulk(long executionId, BulkRequest request, BulkResponse response) { - var idsToBulkItemsWithFailedFlag = - response.items().stream() - .map( - bir -> { - String id = bir.id(); - var error = bir.error(); - boolean failed = false; - if (error != null) { - if (bir.status() == 409) { - LOG.debug("Doc conflict ID {}", id); - } else { - failed = true; - } - } - return new BulkItemResponseToFailedFlag(bir, failed); - }) - .collect( - // https://github.com/apache/stormcrawler/issues/832 - Collectors.groupingBy( - idWithFailedFlagTuple -> idWithFailedFlagTuple.id, - Collectors.toUnmodifiableList())); - Map> presentTuples; - long estimatedSize; - waitAckLock.lock(); - try { - presentTuples = waitAck.getAllPresent(idsToBulkItemsWithFailedFlag.keySet()); - if (!presentTuples.isEmpty()) { - waitAck.invalidateAll(presentTuples.keySet()); - } - estimatedSize = waitAck.estimatedSize(); - } finally { - waitAckLock.unlock(); - } - - int ackCount = 0; - int failureCount = 0; - - for (var 
entry : presentTuples.entrySet()) { - final var id = entry.getKey(); - final var associatedTuple = entry.getValue(); - final var bulkItemsWithFailedFlag = idsToBulkItemsWithFailedFlag.get(id); - - BulkItemResponseToFailedFlag selected; - - if (bulkItemsWithFailedFlag.size() == 1) { - selected = bulkItemsWithFailedFlag.get(0); - } else { - // Fallback if there are multiple responses for the same id - BulkItemResponseToFailedFlag tmp = null; - var ctFailed = 0; - for (var buwff : bulkItemsWithFailedFlag) { - if (tmp == null) { - tmp = buwff; - } - if (buwff.failed) { - ctFailed++; - } else { - tmp = buwff; - } - } - if (ctFailed != bulkItemsWithFailedFlag.size()) { - LOG.warn( - "The id {} would result in an ack and a failure. Using only the ack for processing.", - id); - } - selected = Objects.requireNonNull(tmp); - } - - if (associatedTuple != null) { - LOG.debug("Found {} tuple(s) for ID {}", associatedTuple.size(), id); - for (Tuple t : associatedTuple) { - String url = (String) t.getValueByField("url"); - - if (!selected.failed) { - ackCount++; + waitAck.processBulkResponse( + response, + executionId, + null, + (id, t, selected) -> { + if (!selected.failed()) { _collector.ack(t); } else { - failureCount++; - var failure = selected.getFailure(); - LOG.error("update ID {}, URL {}, failure: {}", id, url, failure); + String url = (String) t.getValueByField("url"); + LOG.error( + "update ID {}, URL {}, failure: {}", + id, + url, + selected.getFailure()); _collector.fail(t); } - } - } else { - LOG.warn("Could not find unacked tuples for {}", entry.getKey()); - } - } - - LOG.info( - "Bulk response [{}] : items {}, waitAck {}, acked {}, failed {}", - executionId, - idsToBulkItemsWithFailedFlag.size(), - estimatedSize, - ackCount, - failureCount); + }); } @Override public void afterBulk(long executionId, BulkRequest request, Throwable failure) { - LOG.error("Exception with bulk {} - failing the whole lot ", executionId, failure); - - final var failedIds = - 
request.operations().stream() - .map(OpenSearchConnection::getBulkOperationId) - .filter(Objects::nonNull) - .collect(Collectors.toUnmodifiableSet()); - Map> failedTupleLists; - waitAckLock.lock(); - try { - failedTupleLists = waitAck.getAllPresent(failedIds); - if (!failedTupleLists.isEmpty()) { - waitAck.invalidateAll(failedTupleLists.keySet()); - } - } finally { - waitAckLock.unlock(); - } - - for (var id : failedIds) { - var failedTuples = failedTupleLists.get(id); - if (failedTuples != null) { - LOG.debug("Failed {} tuple(s) for ID {}", failedTuples.size(), id); - for (Tuple x : failedTuples) { - // fail it - _collector.fail(x); - } - } else { - LOG.warn("Could not find unacked tuple for {}", id); - } - } + waitAck.processFailedBulk(request, executionId, failure, _collector::fail); } } diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/bolt/IndexerBolt.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/bolt/IndexerBolt.java index ce77c07d6..c98a0abab 100644 --- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/bolt/IndexerBolt.java +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/bolt/IndexerBolt.java @@ -19,25 +19,12 @@ import static org.apache.stormcrawler.Constants.StatusStreamName; -import com.github.benmanes.caffeine.cache.Cache; -import com.github.benmanes.caffeine.cache.Caffeine; -import com.github.benmanes.caffeine.cache.RemovalCause; -import com.github.benmanes.caffeine.cache.RemovalListener; import java.io.IOException; import java.util.HashMap; -import java.util.HashSet; -import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Map.Entry; -import java.util.Objects; -import java.util.Set; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.locks.ReentrantLock; -import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; -import 
org.apache.storm.metric.api.MultiCountMetric; -import org.apache.storm.metric.api.MultiReducedMetric; import org.apache.storm.task.OutputCollector; import org.apache.storm.task.TopologyContext; import org.apache.storm.tuple.Tuple; @@ -45,15 +32,15 @@ import org.apache.stormcrawler.Constants; import org.apache.stormcrawler.Metadata; import org.apache.stormcrawler.indexing.AbstractIndexerBolt; +import org.apache.stormcrawler.metrics.CrawlerMetrics; +import org.apache.stormcrawler.metrics.ScopedCounter; +import org.apache.stormcrawler.metrics.ScopedReducedMetric; import org.apache.stormcrawler.opensearch.AsyncBulkProcessor; -import org.apache.stormcrawler.opensearch.BulkItemResponseToFailedFlag; import org.apache.stormcrawler.opensearch.IndexCreation; import org.apache.stormcrawler.opensearch.OpenSearchConnection; +import org.apache.stormcrawler.opensearch.WaitAckCache; import org.apache.stormcrawler.persistence.Status; import org.apache.stormcrawler.util.ConfUtils; -import org.apache.stormcrawler.util.PerSecondReducer; -import org.jetbrains.annotations.NotNull; -import org.jetbrains.annotations.Nullable; import org.opensearch.client.opensearch.core.BulkRequest; import org.opensearch.client.opensearch.core.BulkResponse; import org.opensearch.client.opensearch.core.bulk.BulkOperation; @@ -64,8 +51,7 @@ * Sends documents to opensearch. Indexes all the fields from the tuples or a Map * <String,Object> from a named field. 
*/ -public class IndexerBolt extends AbstractIndexerBolt - implements RemovalListener>, AsyncBulkProcessor.Listener { +public class IndexerBolt extends AbstractIndexerBolt implements AsyncBulkProcessor.Listener { private static final Logger LOG = LoggerFactory.getLogger(IndexerBolt.class); @@ -88,16 +74,13 @@ public class IndexerBolt extends AbstractIndexerBolt // overwritten private boolean create = false; - private MultiCountMetric eventCounter; + private ScopedCounter eventCounter; private OpenSearchConnection connection; - private MultiReducedMetric perSecMetrics; + private ScopedReducedMetric perSecMetrics; - private Cache> waitAck; - - // Be fair due to cache timeout - private final ReentrantLock waitAckLock = new ReentrantLock(true); + private WaitAckCache waitAck; public IndexerBolt() {} @@ -121,25 +104,17 @@ public void prepare( try { connection = OpenSearchConnection.getConnection(conf, OSBoltType, this); } catch (Exception e1) { - LOG.error("Can't connect to opensearch", e1); + LOG.error("Can't connect to OpenSearch", e1); throw new RuntimeException(e1); } - this.eventCounter = context.registerMetric("OpensearchIndexer", new MultiCountMetric(), 10); + this.eventCounter = CrawlerMetrics.registerCounter(context, conf, "OpensearchIndexer", 10); this.perSecMetrics = - context.registerMetric( - "Indexer_average_persec", - new MultiReducedMetric(new PerSecondReducer()), - 10); - - waitAck = - Caffeine.newBuilder() - .expireAfterWrite(60, TimeUnit.SECONDS) - .removalListener(this) - .build(); + CrawlerMetrics.registerPerSecMetric(context, conf, "Indexer_average_persec", 10); - context.registerMetric("waitAck", () -> waitAck.estimatedSize(), 10); + waitAck = new WaitAckCache(LOG, _collector::fail); + CrawlerMetrics.registerGauge(context, conf, "waitAck", waitAck::estimatedSize, 10); // use the default status schema if none has been specified try { @@ -149,24 +124,9 @@ public void prepare( } } - public void onRemoval( - @Nullable String key, @Nullable List value, 
@NotNull RemovalCause cause) { - if (!cause.wasEvicted()) { - return; - } - if (value != null) { - LOG.error("Purged from waitAck {} with {} values", key, value.size()); - for (Tuple t : value) { - _collector.fail(t); - } - } else { - // This should never happen, but log it anyway. - LOG.error("Purged from waitAck {} with no values", key); - } - } - @Override public void cleanup() { + waitAck.shutdown(); if (connection != null) { connection.close(); } @@ -257,18 +217,7 @@ public void execute(Tuple tuple) { })); } - waitAckLock.lock(); - try { - List tt = waitAck.getIfPresent(docID); - if (tt == null) { - tt = new LinkedList<>(); - waitAck.put(docID, tt); - } - tt.add(tuple); - LOG.debug("Added to waitAck {} with ID {} total {}", url, docID, tt.size()); - } finally { - waitAckLock.unlock(); - } + waitAck.addTuple(docID, tuple); connection.addToProcessor(op); @@ -279,12 +228,7 @@ public void execute(Tuple tuple) { // do not send to status stream so that it gets replayed _collector.fail(tuple); - waitAckLock.lock(); - try { - waitAck.invalidate(docID); - } finally { - waitAckLock.unlock(); - } + waitAck.invalidate(docID); } } @@ -306,95 +250,19 @@ public void afterBulk(long executionId, BulkRequest request, BulkResponse respon eventCounter.scope("bulks_received").incrBy(1); eventCounter.scope("bulk_msec").incrBy(response.took()); - var idsToBulkItemsWithFailedFlag = - response.items().stream() - .map( - bir -> { - String id = bir.id(); - var error = bir.error(); - boolean failed = false; - if (error != null) { - if (bir.status() == 409) { - eventCounter.scope("doc_conflicts").incrBy(1); - LOG.debug("Doc conflict ID {}", id); - } else { - failed = true; - } - } - return new BulkItemResponseToFailedFlag(bir, failed); - }) - .collect( - // https://github.com/apache/stormcrawler/issues/832 - Collectors.groupingBy( - idWithFailedFlagTuple -> idWithFailedFlagTuple.id, - Collectors.toUnmodifiableList())); - - Map> presentTuples; - long estimatedSize; - Set debugInfo = null; 
- waitAckLock.lock(); - try { - presentTuples = waitAck.getAllPresent(idsToBulkItemsWithFailedFlag.keySet()); - if (!presentTuples.isEmpty()) { - waitAck.invalidateAll(presentTuples.keySet()); - } - estimatedSize = waitAck.estimatedSize(); - // Only if we have to. - if (LOG.isDebugEnabled() && estimatedSize > 0L) { - debugInfo = new HashSet<>(waitAck.asMap().keySet()); - } - } finally { - waitAckLock.unlock(); - } - - int ackCount = 0; - int failureCount = 0; - - for (var entry : presentTuples.entrySet()) { - final var id = entry.getKey(); - final var associatedTuple = entry.getValue(); - final var bulkItemsWithFailedFlag = idsToBulkItemsWithFailedFlag.get(id); - - BulkItemResponseToFailedFlag selected; - - if (bulkItemsWithFailedFlag.size() == 1) { - selected = bulkItemsWithFailedFlag.get(0); - } else { - // Fallback if there are multiple responses for the same id - BulkItemResponseToFailedFlag tmp = null; - var ctFailed = 0; - for (var buwff : bulkItemsWithFailedFlag) { - if (tmp == null) { - tmp = buwff; - } - if (buwff.failed) { - ctFailed++; - } else { - tmp = buwff; - } - } - if (ctFailed != bulkItemsWithFailedFlag.size()) { - LOG.warn( - "The id {} would result in an ack and a failure. 
Using only the ack for processing.", - id); - } - selected = Objects.requireNonNull(tmp); - } - - if (associatedTuple != null) { - LOG.debug("Found {} tuple(s) for ID {}", associatedTuple.size(), id); - for (Tuple t : associatedTuple) { + waitAck.processBulkResponse( + response, + executionId, + eventCounter, + (id, t, selected) -> { String url = (String) t.getValueByField("url"); - Metadata metadata = (Metadata) t.getValueByField("metadata"); - if (!selected.failed) { - ackCount++; + if (!selected.failed()) { _collector.emit( StatusStreamName, t, new Values(url, metadata, Status.FETCHED)); _collector.ack(t); } else { - failureCount++; var failure = selected.getFailure(); LOG.error("update ID {}, URL {}, failure: {}", id, url, failure); // there is something wrong with the content we should @@ -407,64 +275,23 @@ public void afterBulk(long executionId, BulkRequest request, BulkResponse respon _collector.ack(t); LOG.debug("Acked {} with ID {}", url, id); } else { - // otherwise just fail it _collector.fail(t); LOG.debug("Failed {} with ID {}", url, id); } } - } - } else { - LOG.warn("Could not find unacked tuples for {}", entry.getKey()); - } - } - - LOG.info( - "Bulk response [{}] : items {}, waitAck {}, acked {}, failed {}", - executionId, - idsToBulkItemsWithFailedFlag.size(), - estimatedSize, - ackCount, - failureCount); - if (debugInfo != null) { - for (String kinaw : debugInfo) { - LOG.debug("Still in wait ack after bulk response [{}] => {}", executionId, kinaw); - } - } + }); } @Override public void afterBulk(long executionId, BulkRequest request, Throwable failure) { eventCounter.scope("bulks_received").incrBy(1); - LOG.error("Exception with bulk {} - failing the whole lot ", executionId, failure); - - final var failedIds = - request.operations().stream() - .map(OpenSearchConnection::getBulkOperationId) - .filter(Objects::nonNull) - .collect(Collectors.toUnmodifiableSet()); - Map> failedTupleLists; - waitAckLock.lock(); - try { - failedTupleLists = 
waitAck.getAllPresent(failedIds); - if (!failedTupleLists.isEmpty()) { - waitAck.invalidateAll(failedTupleLists.keySet()); - } - } finally { - waitAckLock.unlock(); - } - - for (var id : failedIds) { - var failedTuples = failedTupleLists.get(id); - if (failedTuples != null) { - LOG.debug("Failed {} tuple(s) for ID {}", failedTuples.size(), id); - for (Tuple x : failedTuples) { - // fail it + waitAck.processFailedBulk( + request, + executionId, + failure, + t -> { eventCounter.scope("failed").incrBy(1); - _collector.fail(x); - } - } else { - LOG.warn("Could not find unacked tuple for {}", id); - } - } + _collector.fail(t); + }); } } diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/filtering/JSONURLFilterWrapper.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/filtering/JSONURLFilterWrapper.java index d983bb0cc..49ee7f0ca 100644 --- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/filtering/JSONURLFilterWrapper.java +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/filtering/JSONURLFilterWrapper.java @@ -18,24 +18,13 @@ package org.apache.stormcrawler.opensearch.filtering; import com.fasterxml.jackson.databind.JsonNode; -import java.io.ByteArrayInputStream; -import java.io.IOException; import java.net.URL; -import java.nio.charset.StandardCharsets; import java.util.Map; -import java.util.Timer; -import java.util.TimerTask; -import org.apache.stormcrawler.JSONResource; import org.apache.stormcrawler.Metadata; import org.apache.stormcrawler.filtering.URLFilter; -import org.apache.stormcrawler.opensearch.OpenSearchConnection; +import org.apache.stormcrawler.opensearch.DelegateRefresher; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; -import org.opensearch.client.json.JsonData; -import org.opensearch.client.opensearch.OpenSearchClient; -import org.opensearch.client.opensearch.core.GetResponse; -import 
org.slf4j.Logger; -import org.slf4j.LoggerFactory; /** * Wraps a URLFilter whose resources are in a JSON file that can be stored in OpenSearch. The @@ -69,101 +58,12 @@ */ public class JSONURLFilterWrapper extends URLFilter { - private static final Logger LOG = LoggerFactory.getLogger(JSONURLFilterWrapper.class); - - private URLFilter delegatedURLFilter; - private Timer refreshTimer; - private OpenSearchClient osClient; + private DelegateRefresher refresher; public void configure(@NotNull Map stormConf, @NotNull JsonNode filterParams) { - - String urlfilterclass = null; - - JsonNode delegateNode = filterParams.get("delegate"); - if (delegateNode == null) { - throw new RuntimeException("delegateNode undefined!"); - } - - JsonNode node = delegateNode.get("class"); - if (node != null && node.isTextual()) { - urlfilterclass = node.asText(); - } - - if (urlfilterclass == null) { - throw new RuntimeException("urlfilter.class undefined!"); - } - - // load an instance of the delegated parsefilter - try { - Class filterClass = Class.forName(urlfilterclass); - - boolean subClassOK = URLFilter.class.isAssignableFrom(filterClass); - if (!subClassOK) { - throw new RuntimeException( - "Filter " + urlfilterclass + " does not extend URLFilter"); - } - - delegatedURLFilter = (URLFilter) filterClass.getDeclaredConstructor().newInstance(); - - // check that it implements JSONResource - if (!JSONResource.class.isInstance(delegatedURLFilter)) { - throw new RuntimeException( - "Filter " + urlfilterclass + " does not implement JSONResource"); - } - - } catch (Exception e) { - LOG.error("Can't setup {}: {}", urlfilterclass, e); - throw new RuntimeException("Can't setup " + urlfilterclass, e); - } - - // configure it - node = delegateNode.get("params"); - - delegatedURLFilter.configure(stormConf, node); - - int refreshRate = 600; - - node = filterParams.get("refresh"); - if (node != null && node.isInt()) { - refreshRate = node.asInt(refreshRate); - } - - final JSONResource resource = 
(JSONResource) delegatedURLFilter; - - refreshTimer = new Timer(); - refreshTimer.schedule( - new TimerTask() { - public void run() { - if (osClient == null) { - try { - osClient = OpenSearchConnection.getClient(stormConf, "config"); - } catch (Exception e) { - LOG.error("Exception while creating OpenSearch connection", e); - } - } - if (osClient != null) { - LOG.info("Reloading json resources from OpenSearch"); - try { - GetResponse response = - osClient.get( - g -> - g.index("config") - .id(resource.getResourceFile()), - JsonData.class); - if (response.found() && response.source() != null) { - String json = response.source().toJson().toString(); - resource.loadJSONResources( - new ByteArrayInputStream( - json.getBytes(StandardCharsets.UTF_8))); - } - } catch (Exception e) { - LOG.error("Can't load config from OpenSearch", e); - } - } - } - }, - 0, - refreshRate * 1000); + refresher = + new DelegateRefresher<>( + URLFilter.class, stormConf, filterParams, URLFilter::configure); } @Override @@ -171,20 +71,13 @@ public void run() { @Nullable URL sourceUrl, @Nullable Metadata sourceMetadata, @NotNull String urlToFilter) { - return delegatedURLFilter.filter(sourceUrl, sourceMetadata, urlToFilter); + return refresher.getDelegate().filter(sourceUrl, sourceMetadata, urlToFilter); } @Override public void cleanup() { - if (refreshTimer != null) { - refreshTimer.cancel(); - } - if (osClient != null) { - try { - osClient._transport().close(); - } catch (IOException e) { - LOG.error("Exception when closing OpenSearch client", e); - } + if (refresher != null) { + refresher.cleanup(); } } } diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/metrics/MetricsReporter.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/metrics/MetricsReporter.java new file mode 100644 index 000000000..d3b654a03 --- /dev/null +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/metrics/MetricsReporter.java @@ 
-0,0 +1,219 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.stormcrawler.opensearch.metrics; + +import com.codahale.metrics.Counter; +import com.codahale.metrics.Gauge; +import com.codahale.metrics.Histogram; +import com.codahale.metrics.Meter; +import com.codahale.metrics.MetricFilter; +import com.codahale.metrics.MetricRegistry; +import com.codahale.metrics.ScheduledReporter; +import com.codahale.metrics.Timer; +import java.io.IOException; +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.HashMap; +import java.util.Locale; +import java.util.Map; +import java.util.SortedMap; +import java.util.concurrent.TimeUnit; +import org.apache.storm.metrics2.reporters.ScheduledStormReporter; +import org.apache.stormcrawler.opensearch.IndexCreation; +import org.apache.stormcrawler.opensearch.OpenSearchConnection; +import org.apache.stormcrawler.util.ConfUtils; +import org.opensearch.client.opensearch.core.bulk.BulkOperation; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Storm V2 metrics reporter that writes metrics to an OpenSearch index with the same document + * structure as the V1 {@link MetricsConsumer}. 
This allows existing OpenSearch dashboards to work + * unchanged during migration from V1 to V2 metrics. + * + *

Configuration in storm.yaml: + * + *

+ *   storm.metrics.reporters:
+ *     - class: "org.apache.stormcrawler.opensearch.metrics.MetricsReporter"
+ *       report.period: 10
+ *       report.period.units: "SECONDS"
+ * 
+ */ +public class MetricsReporter extends ScheduledStormReporter { + + private static final Logger LOG = LoggerFactory.getLogger(MetricsReporter.class); + + private static final String OSBoltType = "metrics"; + + private static final String OSMetricsIndexNameParamName = + "opensearch." + OSBoltType + ".index.name"; + + private static final String DATE_FORMAT_KEY = "opensearch.metrics.date.format"; + + private ScheduledReporter reporter; + + @Override + public void prepare( + MetricRegistry metricsRegistry, + Map topoConf, + Map reporterConf) { + + String indexName = ConfUtils.getString(topoConf, OSMetricsIndexNameParamName, "metrics"); + String stormId = (String) topoConf.getOrDefault("storm.id", "unknown"); + + SimpleDateFormat dateFormat = null; + String dateFormatStr = ConfUtils.getString(topoConf, DATE_FORMAT_KEY, null); + if (dateFormatStr != null) { + dateFormat = new SimpleDateFormat(dateFormatStr, Locale.ROOT); + } + + OpenSearchConnection connection; + try { + connection = OpenSearchConnection.getConnection(topoConf, OSBoltType); + } catch (Exception e) { + LOG.error("Can't connect to OpenSearch", e); + throw new RuntimeException(e); + } + + try { + IndexCreation.checkOrCreateIndexTemplate(connection.getClient(), OSBoltType, LOG); + } catch (IOException e) { + throw new RuntimeException(e); + } + + TimeUnit reportPeriodUnit = getReportPeriodUnit(reporterConf); + long reportPeriod = getReportPeriod(reporterConf); + + reporter = + new OpenSearchScheduledReporter( + metricsRegistry, indexName, stormId, dateFormat, connection); + + reporter.start(reportPeriod, reportPeriodUnit); + } + + @Override + public void start() { + // already started in prepare() + } + + @Override + public void stop() { + if (reporter != null) { + reporter.stop(); + } + } + + /** + * Inner ScheduledReporter that writes Codahale metrics to OpenSearch in the same format as the + * V1 {@link MetricsConsumer}. 
+ */ + private static class OpenSearchScheduledReporter extends ScheduledReporter { + + private final String indexName; + private final String stormId; + private final SimpleDateFormat dateFormat; + private final OpenSearchConnection connection; + + OpenSearchScheduledReporter( + MetricRegistry registry, + String indexName, + String stormId, + SimpleDateFormat dateFormat, + OpenSearchConnection connection) { + super( + registry, + "opensearch-metrics-reporter", + MetricFilter.ALL, + TimeUnit.SECONDS, + TimeUnit.MILLISECONDS); + this.indexName = indexName; + this.stormId = stormId; + this.dateFormat = dateFormat; + this.connection = connection; + } + + @Override + @SuppressWarnings("rawtypes") + public void report( + SortedMap gauges, + SortedMap counters, + SortedMap histograms, + SortedMap meters, + SortedMap timers) { + + Date now = new Date(); + + for (Map.Entry entry : gauges.entrySet()) { + Object value = entry.getValue().getValue(); + if (value instanceof Number) { + indexDataPoint(now, entry.getKey(), ((Number) value).doubleValue()); + } else if (value instanceof Map) { + for (Map.Entry mapEntry : ((Map) value).entrySet()) { + if (mapEntry.getValue() instanceof Number) { + indexDataPoint( + now, + entry.getKey() + "." 
+ mapEntry.getKey(), + ((Number) mapEntry.getValue()).doubleValue()); + } + } + } + } + + for (Map.Entry entry : counters.entrySet()) { + indexDataPoint(now, entry.getKey(), entry.getValue().getCount()); + } + + for (Map.Entry entry : histograms.entrySet()) { + indexDataPoint(now, entry.getKey(), entry.getValue().getSnapshot().getMean()); + } + + for (Map.Entry entry : meters.entrySet()) { + indexDataPoint(now, entry.getKey(), entry.getValue().getOneMinuteRate()); + } + + for (Map.Entry entry : timers.entrySet()) { + indexDataPoint(now, entry.getKey(), entry.getValue().getSnapshot().getMean()); + } + } + + private String getIndexName(Date timestamp) { + if (dateFormat == null) { + return indexName; + } + return indexName + "-" + dateFormat.format(timestamp); + } + + private void indexDataPoint(Date timestamp, String name, double value) { + try { + Map doc = new HashMap<>(); + doc.put("stormId", stormId); + doc.put("name", name); + doc.put("value", value); + doc.put("timestamp", timestamp.toInstant().toString()); + + final String targetIndex = getIndexName(timestamp); + BulkOperation op = + BulkOperation.of(b -> b.index(idx -> idx.index(targetIndex).document(doc))); + connection.addToProcessor(op); + } catch (Exception e) { + LOG.error("Problem when building request for OpenSearch", e); + } + } + } +} diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/metrics/StatusMetricsBolt.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/metrics/StatusMetricsBolt.java index 697dd17a6..c74184c22 100644 --- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/metrics/StatusMetricsBolt.java +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/metrics/StatusMetricsBolt.java @@ -29,6 +29,7 @@ import org.apache.storm.topology.base.BaseRichBolt; import org.apache.storm.tuple.Tuple; import org.apache.storm.utils.TupleUtils; +import 
org.apache.stormcrawler.metrics.CrawlerMetrics; import org.apache.stormcrawler.opensearch.Constants; import org.apache.stormcrawler.opensearch.OpenSearchConnection; import org.apache.stormcrawler.util.ConfUtils; @@ -82,12 +83,8 @@ public void prepare( throw new RuntimeException(e1); } - context.registerMetric( - "status.count", - () -> { - return latestStatusCounts; - }, - freqStats); + CrawlerMetrics.registerGauge( + context, stormConf, "status.count", () -> latestStatusCounts, freqStats); counters = new StatusCounter[6]; diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java index b96563e86..a5946cea3 100644 --- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/parse/filter/JSONResourceWrapper.java @@ -18,22 +18,11 @@ package org.apache.stormcrawler.opensearch.parse.filter; import com.fasterxml.jackson.databind.JsonNode; -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.nio.charset.StandardCharsets; import java.util.Map; -import java.util.Timer; -import java.util.TimerTask; -import org.apache.stormcrawler.JSONResource; -import org.apache.stormcrawler.opensearch.OpenSearchConnection; +import org.apache.stormcrawler.opensearch.DelegateRefresher; import org.apache.stormcrawler.parse.ParseFilter; import org.apache.stormcrawler.parse.ParseResult; import org.jetbrains.annotations.NotNull; -import org.opensearch.client.json.JsonData; -import org.opensearch.client.opensearch.OpenSearchClient; -import org.opensearch.client.opensearch.core.GetResponse; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import org.w3c.dom.DocumentFragment; /** @@ -68,119 +57,23 @@ */ public class JSONResourceWrapper extends 
ParseFilter { - private static final Logger LOG = LoggerFactory.getLogger(JSONResourceWrapper.class); - - private ParseFilter delegatedParseFilter; - private Timer refreshTimer; - private OpenSearchClient osClient; + private DelegateRefresher refresher; public void configure(@NotNull Map stormConf, @NotNull JsonNode filterParams) { - - String parsefilterclass = null; - - JsonNode delegateNode = filterParams.get("delegate"); - if (delegateNode == null) { - throw new RuntimeException("delegateNode undefined!"); - } - - JsonNode node = delegateNode.get("class"); - if (node != null && node.isTextual()) { - parsefilterclass = node.asText(); - } - - if (parsefilterclass == null) { - throw new RuntimeException("parsefilter.class undefined!"); - } - - // load an instance of the delegated parsefilter - try { - Class filterClass = Class.forName(parsefilterclass); - - boolean subClassOK = ParseFilter.class.isAssignableFrom(filterClass); - if (!subClassOK) { - throw new RuntimeException( - "Filter " + parsefilterclass + " does not extend ParseFilter"); - } - - delegatedParseFilter = (ParseFilter) filterClass.getDeclaredConstructor().newInstance(); - - // check that it implements JSONResource - if (!JSONResource.class.isInstance(delegatedParseFilter)) { - throw new RuntimeException( - "Filter " + parsefilterclass + " does not implement JSONResource"); - } - - } catch (Exception e) { - LOG.error("Can't setup {}: {}", parsefilterclass, e); - throw new RuntimeException("Can't setup " + parsefilterclass, e); - } - - // configure it - node = delegateNode.get("params"); - - delegatedParseFilter.configure(stormConf, node); - - int refreshRate = 600; - - node = filterParams.get("refresh"); - if (node != null && node.isInt()) { - refreshRate = node.asInt(refreshRate); - } - - final JSONResource resource = (JSONResource) delegatedParseFilter; - - refreshTimer = new Timer(); - refreshTimer.schedule( - new TimerTask() { - public void run() { - if (osClient == null) { - try { - osClient = 
OpenSearchConnection.getClient(stormConf, "config"); - } catch (Exception e) { - LOG.error("Exception while creating OpenSearch connection", e); - } - } - if (osClient != null) { - LOG.info("Reloading json resources from OpenSearch"); - try { - GetResponse response = - osClient.get( - g -> - g.index("config") - .id(resource.getResourceFile()), - JsonData.class); - if (response.found() && response.source() != null) { - String json = response.source().toJson().toString(); - resource.loadJSONResources( - new ByteArrayInputStream( - json.getBytes(StandardCharsets.UTF_8))); - } - } catch (Exception e) { - LOG.error("Can't load config from OpenSearch", e); - } - } - } - }, - 0, - refreshRate * 1000); + refresher = + new DelegateRefresher<>( + ParseFilter.class, stormConf, filterParams, ParseFilter::configure); } @Override public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult parse) { - delegatedParseFilter.filter(URL, content, doc, parse); + refresher.getDelegate().filter(URL, content, doc, parse); } @Override public void cleanup() { - if (refreshTimer != null) { - refreshTimer.cancel(); - } - if (osClient != null) { - try { - osClient._transport().close(); - } catch (IOException e) { - LOG.error("Exception when closing OpenSearch client", e); - } + if (refresher != null) { + refresher.cleanup(); } } } diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/AbstractSpout.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/AbstractSpout.java index 6cd315d38..96a8f87f3 100644 --- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/AbstractSpout.java +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/AbstractSpout.java @@ -114,7 +114,7 @@ public void open( client = OpenSearchConnection.getClient(stormConf, OSBoltType); } } catch (Exception e1) { - LOG.error("Can't connect to 
ElasticSearch", e1); + LOG.error("Can't connect to OpenSearch", e1); throw new RuntimeException(e1); } diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/AggregationSpout.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/AggregationSpout.java index 62bc6faeb..6e29a90c8 100644 --- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/AggregationSpout.java +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/AggregationSpout.java @@ -40,6 +40,7 @@ import org.apache.stormcrawler.opensearch.Constants; import org.apache.stormcrawler.util.ConfUtils; import org.opensearch.client.json.JsonData; +import org.opensearch.client.opensearch._types.FieldValue; import org.opensearch.client.opensearch._types.SortOrder; import org.opensearch.client.opensearch._types.aggregations.Aggregate; import org.opensearch.client.opensearch._types.aggregations.Aggregation; @@ -215,7 +216,7 @@ protected void populateBuffer() { // dump query to log LOG.debug("{} OpenSearch query {}", logIdprefix, request); - LOG.trace("{} isInquery set to true", logIdprefix); + LOG.trace("{} isInQuery set to true", logIdprefix); isInQuery.set(true); CompletableFuture.supplyAsync( @@ -281,8 +282,7 @@ protected void handleResponse(SearchResponse response) { int hitsForThisBucket = 0; - List lastSortValues = null; - + List lastSortValues = null; // filter results so that we don't include URLs we are already // being processed TopHitsAggregate topHits = entry.aggregations().get("docs").topHits(); @@ -338,7 +338,7 @@ protected void handleResponse(SearchResponse response) { numhits += hitsForThisBucket; LOG.debug( - "{} key [{}], hits[{}], doc_count [{}]", + "{} key [{}], hits[{}], doc_count [{}], already_processed [{}]", logIdprefix, key, hitsForThisBucket, @@ -355,10 +355,10 @@ protected void handleResponse(SearchResponse response) { alreadyprocessed, 
((float) timeTaken / numhits)); - queryTimes.addMeasurement(timeTaken); + queryTimes.accept(timeTaken); eventCounter.scope("already_being_processed").incrBy(alreadyprocessed); - eventCounter.scope("ES_queries").incrBy(1); - eventCounter.scope("ES_docs").incrBy(numhits); + eventCounter.scope("OpenSearch_queries").incrBy(1); + eventCounter.scope("OpenSearch_docs").incrBy(numhits); // optimise the nextFetchDate by getting the most recent value // returned in the query and add to it, unless the previous value is diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/HybridSpout.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/HybridSpout.java index fd600f0af..5ad703e5d 100644 --- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/HybridSpout.java +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/HybridSpout.java @@ -142,7 +142,7 @@ public void emptyQueue(String queueName) { Object[] searchAfterValues = searchAfterCache.getIfPresent(queueName); if (searchAfterValues != null) { for (Object sav : searchAfterValues) { - requestBuilder.searchAfter(sav.toString()); + requestBuilder.searchAfter(FieldValue.of(sav.toString())); } } diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java index a3f1d1abf..93626a92b 100644 --- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java @@ -17,42 +17,30 @@ package org.apache.stormcrawler.opensearch.persistence; -import com.github.benmanes.caffeine.cache.Cache; -import com.github.benmanes.caffeine.cache.Caffeine; -import 
com.github.benmanes.caffeine.cache.RemovalCause; -import com.github.benmanes.caffeine.cache.RemovalListener; import java.io.IOException; import java.util.Date; import java.util.HashMap; -import java.util.HashSet; -import java.util.LinkedList; import java.util.List; import java.util.Locale; import java.util.Map; -import java.util.Objects; import java.util.Optional; -import java.util.Set; -import java.util.concurrent.locks.ReentrantLock; -import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; -import org.apache.storm.metric.api.MultiCountMetric; -import org.apache.storm.metric.api.MultiReducedMetric; import org.apache.storm.task.OutputCollector; import org.apache.storm.task.TopologyContext; import org.apache.storm.tuple.Tuple; import org.apache.stormcrawler.Metadata; +import org.apache.stormcrawler.metrics.CrawlerMetrics; +import org.apache.stormcrawler.metrics.ScopedCounter; +import org.apache.stormcrawler.metrics.ScopedReducedMetric; import org.apache.stormcrawler.opensearch.AsyncBulkProcessor; -import org.apache.stormcrawler.opensearch.BulkItemResponseToFailedFlag; import org.apache.stormcrawler.opensearch.Constants; import org.apache.stormcrawler.opensearch.IndexCreation; import org.apache.stormcrawler.opensearch.OpenSearchConnection; +import org.apache.stormcrawler.opensearch.WaitAckCache; import org.apache.stormcrawler.persistence.AbstractStatusUpdaterBolt; import org.apache.stormcrawler.persistence.Status; import org.apache.stormcrawler.util.ConfUtils; -import org.apache.stormcrawler.util.PerSecondReducer; import org.apache.stormcrawler.util.URLPartitioner; -import org.jetbrains.annotations.NotNull; -import org.jetbrains.annotations.Nullable; import org.opensearch.client.opensearch.core.BulkRequest; import org.opensearch.client.opensearch.core.BulkResponse; import org.opensearch.client.opensearch.core.bulk.BulkOperation; @@ -64,7 +52,7 @@ * 'status' stream. To be used in combination with a Spout to read from the index. 
*/ public class StatusUpdaterBolt extends AbstractStatusUpdaterBolt - implements RemovalListener>, AsyncBulkProcessor.Listener { + implements AsyncBulkProcessor.Listener { private static final Logger LOG = LoggerFactory.getLogger(StatusUpdaterBolt.class); @@ -90,14 +78,11 @@ public class StatusUpdaterBolt extends AbstractStatusUpdaterBolt private OpenSearchConnection connection; - private Cache> waitAck; + private WaitAckCache waitAck; - // Be fair due to cache timeout - private final ReentrantLock waitAckLock = new ReentrantLock(true); + private ScopedCounter eventCounter; - private MultiCountMetric eventCounter; - - private MultiReducedMetric receivedPerSecMetrics; + private ScopedReducedMetric receivedPerSecMetrics; public StatusUpdaterBolt() { super(); @@ -155,6 +140,18 @@ public void prepare( fieldNameForRoutingKey = fieldNameForRoutingKey.replaceAll("\\.", "%2E"); } + int metrics_time_bucket_secs = 30; + + // benchmarking - average number of items received back from OpenSearch per second + this.receivedPerSecMetrics = + CrawlerMetrics.registerPerSecMetric( + context, stormConf, "average_persec", metrics_time_bucket_secs); + + // eventCounter MUST be registered before WaitAckCache — the eviction lambda captures it + this.eventCounter = + CrawlerMetrics.registerCounter( + context, stormConf, "counters", metrics_time_bucket_secs); + String defaultSpec = String.format( Locale.ROOT, @@ -164,23 +161,16 @@ public void prepare( String waitAckSpec = ConfUtils.getString(stormConf, "opensearch.status.waitack.cache.spec", defaultSpec); - waitAck = Caffeine.from(waitAckSpec).removalListener(this).build(); - - int metrics_time_bucket_secs = 30; - - // create gauge for waitAck - context.registerMetric("waitAck", () -> waitAck.estimatedSize(), metrics_time_bucket_secs); - - // benchmarking - average number of items received back by Elastic per second - this.receivedPerSecMetrics = - context.registerMetric( - "average_persec", - new MultiReducedMetric(new PerSecondReducer()), 
- metrics_time_bucket_secs); - - this.eventCounter = - context.registerMetric( - "counters", new MultiCountMetric(), metrics_time_bucket_secs); + waitAck = + new WaitAckCache( + waitAckSpec, + LOG, + t -> { + eventCounter.scope("purged").incrBy(1); + collector.fail(t); + }); + CrawlerMetrics.registerGauge( + context, stormConf, "waitAck", waitAck::estimatedSize, metrics_time_bucket_secs); try { connection = OpenSearchConnection.getConnection(stormConf, OSBoltType, this); @@ -199,6 +189,7 @@ public void prepare( @Override public void cleanup() { + waitAck.shutdown(); if (connection == null) { return; } @@ -213,17 +204,8 @@ public void store( String documentID = getDocumentID(metadata, url); - boolean isAlreadySentAndDiscovered; - // need to synchronize: otherwise it might get added to the cache - // without having been sent to OpenSearch - waitAckLock.lock(); - try { - // check that the same URL is not being sent to OpenSearch - final var alreadySent = waitAck.getIfPresent(documentID); - isAlreadySentAndDiscovered = status.equals(Status.DISCOVERED) && alreadySent != null; - } finally { - waitAckLock.unlock(); - } + boolean isAlreadySentAndDiscovered = + status.equals(Status.DISCOVERED) && waitAck.contains(documentID); if (isAlreadySentAndDiscovered) { // if this object is discovered - adding another version of it @@ -305,33 +287,13 @@ public void store( })); } - waitAckLock.lock(); - try { - final List tt = waitAck.get(documentID, k -> new LinkedList<>()); - tt.add(tuple); - LOG.debug("Added to waitAck {} with ID {} total {}", url, documentID, tt.size()); - } finally { - waitAckLock.unlock(); - } + waitAck.addTuple(documentID, tuple); LOG.debug("Sending to OpenSearch buffer {} with ID {}", url, documentID); connection.addToProcessor(op); } - @Override - public void onRemoval( - @Nullable String key, @Nullable List value, @NotNull RemovalCause cause) { - if (!cause.wasEvicted()) { - return; - } - LOG.error("Purged from waitAck {} with {} values", key, value.size()); 
- for (Tuple t : value) { - eventCounter.scope("purged").incrBy(1); - collector.fail(t); - } - } - @Override public void afterBulk(long executionId, BulkRequest request, BulkResponse response) { LOG.debug("afterBulk [{}] with {} responses", executionId, request.operations().size()); @@ -340,120 +302,21 @@ public void afterBulk(long executionId, BulkRequest request, BulkResponse respon eventCounter.scope("received").incrBy(request.operations().size()); receivedPerSecMetrics.scope("received").update(request.operations().size()); - var idsToBulkItemsWithFailedFlag = - response.items().stream() - .map( - bir -> { - String id = bir.id(); - var error = bir.error(); - boolean failed = false; - if (error != null) { - // already discovered - if (bir.status() == 409) { - eventCounter.scope("doc_conflicts").incrBy(1); - LOG.debug("Doc conflict ID {}", id); - } else { - LOG.error( - "Update ID {}, failure: {}", - id, - error.reason() != null - ? error.reason() - : "unknown"); - failed = true; - } - } - return new BulkItemResponseToFailedFlag(bir, failed); - }) - .collect( - // https://github.com/apache/stormcrawler/issues/832 - Collectors.groupingBy( - idWithFailedFlagTuple -> idWithFailedFlagTuple.id, - Collectors.toUnmodifiableList())); - - Map> presentTuples; - long estimatedSize; - Set debugInfo = null; - waitAckLock.lock(); - try { - presentTuples = waitAck.getAllPresent(idsToBulkItemsWithFailedFlag.keySet()); - if (!presentTuples.isEmpty()) { - waitAck.invalidateAll(presentTuples.keySet()); - } - estimatedSize = waitAck.estimatedSize(); - // Only if we have to. 
- if (LOG.isDebugEnabled() && estimatedSize > 0L) { - debugInfo = new HashSet<>(waitAck.asMap().keySet()); - } - } finally { - waitAckLock.unlock(); - } - - int ackCount = 0; - int failureCount = 0; - - for (var entry : presentTuples.entrySet()) { - final var id = entry.getKey(); - final var associatedTuple = entry.getValue(); - final var bulkItemsWithFailedFlag = idsToBulkItemsWithFailedFlag.get(id); - - BulkItemResponseToFailedFlag selected; - if (bulkItemsWithFailedFlag.size() == 1) { - selected = bulkItemsWithFailedFlag.get(0); - } else { - // Fallback if there are multiple responses for the same id - BulkItemResponseToFailedFlag tmp = null; - var ctFailed = 0; - for (var buwff : bulkItemsWithFailedFlag) { - if (tmp == null) { - tmp = buwff; - } - if (buwff.failed) { - ctFailed++; - } else { - tmp = buwff; - } - } - if (ctFailed != bulkItemsWithFailedFlag.size()) { - LOG.warn( - "The id {} would result in an ack and a failure. Using only the ack for processing.", - id); - } - selected = Objects.requireNonNull(tmp); - } - - if (associatedTuple != null) { - LOG.debug("Acked {} tuple(s) for ID {}", associatedTuple.size(), id); - for (Tuple tuple : associatedTuple) { - if (!selected.failed) { + waitAck.processBulkResponse( + response, + executionId, + eventCounter, + (id, tuple, selected) -> { + if (!selected.failed()) { String url = tuple.getStringByField("url"); - ackCount++; - // ack and put in cache LOG.debug("Acked {} with ID {}", url, id); eventCounter.scope("acked").incrBy(1); super.ack(tuple, url); } else { - failureCount++; eventCounter.scope("failed").incrBy(1); collector.fail(tuple); } - } - } else { - LOG.warn("Could not find unacked tuple for {}", id); - } - } - - LOG.info( - "Bulk response [{}] : items {}, waitAck {}, acked {}, failed {}", - executionId, - idsToBulkItemsWithFailedFlag.size(), - estimatedSize, - ackCount, - failureCount); - if (debugInfo != null) { - for (String kinaw : debugInfo) { - LOG.debug("Still in wait ack after bulk response 
[{}] => {}", executionId, kinaw); - } - } + }); } @Override @@ -461,37 +324,15 @@ public void afterBulk(long executionId, BulkRequest request, Throwable throwable eventCounter.scope("bulks_received").incrBy(1); eventCounter.scope("received").incrBy(request.operations().size()); receivedPerSecMetrics.scope("received").update(request.operations().size()); - LOG.error("Exception with bulk {} - failing the whole lot ", executionId, throwable); - - final var failedIds = - request.operations().stream() - .map(OpenSearchConnection::getBulkOperationId) - .filter(Objects::nonNull) - .collect(Collectors.toUnmodifiableSet()); - Map> failedTupleLists; - waitAckLock.lock(); - try { - failedTupleLists = waitAck.getAllPresent(failedIds); - if (!failedTupleLists.isEmpty()) { - waitAck.invalidateAll(failedTupleLists.keySet()); - } - } finally { - waitAckLock.unlock(); - } - for (var id : failedIds) { - var failedTuples = failedTupleLists.get(id); - if (failedTuples != null) { - LOG.debug("Failed {} tuple(s) for ID {}", failedTuples.size(), id); - for (Tuple x : failedTuples) { - // fail it + waitAck.processFailedBulk( + request, + executionId, + throwable, + t -> { eventCounter.scope("failed").incrBy(1); - collector.fail(x); - } - } else { - LOG.warn("Could not find unacked tuple for {}", id); - } - } + collector.fail(t); + }); } @Override diff --git a/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/DelegateRefresherTest.java b/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/DelegateRefresherTest.java new file mode 100644 index 000000000..920ad5dea --- /dev/null +++ b/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/DelegateRefresherTest.java @@ -0,0 +1,256 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.stormcrawler.opensearch; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ObjectNode; +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.atomic.AtomicBoolean; +import org.apache.stormcrawler.JSONResource; +import org.apache.stormcrawler.Metadata; +import org.apache.stormcrawler.filtering.URLFilter; +import org.apache.stormcrawler.parse.ParseFilter; +import org.apache.stormcrawler.parse.ParseResult; +import org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; +import org.junit.jupiter.api.Test; +import org.w3c.dom.DocumentFragment; + +class DelegateRefresherTest { + + private static final ObjectMapper MAPPER = new ObjectMapper(); + + /** Minimal URLFilter + JSONResource implementation for testing. 
*/ + public static class StubURLFilter extends URLFilter implements JSONResource { + + public final AtomicBoolean configured = new AtomicBoolean(false); + + @Override + public void configure(@NotNull Map stormConf, @NotNull JsonNode params) { + configured.set(true); + } + + @Override + public @Nullable String filter( + @Nullable URL sourceUrl, + @Nullable Metadata sourceMetadata, + @NotNull String urlToFilter) { + return urlToFilter; + } + + @Override + public String getResourceFile() { + return "stub.json"; + } + + @Override + public void loadJSONResources(InputStream inputStream) throws IOException {} + } + + /** Minimal ParseFilter + JSONResource implementation for testing. */ + public static class StubParseFilter extends ParseFilter implements JSONResource { + + public final AtomicBoolean configured = new AtomicBoolean(false); + + @Override + public void configure(@NotNull Map stormConf, @NotNull JsonNode params) { + configured.set(true); + } + + @Override + public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult parse) {} + + @Override + public String getResourceFile() { + return "stub.json"; + } + + @Override + public void loadJSONResources(InputStream inputStream) throws IOException {} + } + + /** A URLFilter that does NOT implement JSONResource. */ + public static class NonJsonResourceURLFilter extends URLFilter { + + @Override + public @Nullable String filter( + @Nullable URL sourceUrl, + @Nullable Metadata sourceMetadata, + @NotNull String urlToFilter) { + return urlToFilter; + } + } + + /** Not a URLFilter at all. 
*/ + public static class NotAFilter {} + + private JsonNode buildParams(String delegateClass) { + return buildParams(delegateClass, 600); + } + + private JsonNode buildParams(String delegateClass, int refreshRate) { + ObjectNode delegate = MAPPER.createObjectNode(); + delegate.put("class", delegateClass); + delegate.set("params", MAPPER.createObjectNode()); + + ObjectNode params = MAPPER.createObjectNode(); + params.set("delegate", delegate); + params.put("refresh", refreshRate); + return params; + } + + @Test + void loadsURLFilterDelegate() { + JsonNode params = buildParams(StubURLFilter.class.getName()); + Map conf = new HashMap<>(); + + DelegateRefresher refresher = + new DelegateRefresher<>( + URLFilter.class, conf, params, (d, c, p) -> d.configure(c, p)); + + try { + assertNotNull(refresher.getDelegate()); + assertInstanceOf(StubURLFilter.class, refresher.getDelegate()); + assertTrue(((StubURLFilter) refresher.getDelegate()).configured.get()); + } finally { + refresher.cleanup(); + } + } + + @Test + void loadsParseFilterDelegate() { + JsonNode params = buildParams(StubParseFilter.class.getName()); + Map conf = new HashMap<>(); + + DelegateRefresher refresher = + new DelegateRefresher<>( + ParseFilter.class, conf, params, (d, c, p) -> d.configure(c, p)); + + try { + assertNotNull(refresher.getDelegate()); + assertInstanceOf(StubParseFilter.class, refresher.getDelegate()); + assertTrue(((StubParseFilter) refresher.getDelegate()).configured.get()); + } finally { + refresher.cleanup(); + } + } + + @Test + void delegateFilterActuallyWorks() { + JsonNode params = buildParams(StubURLFilter.class.getName()); + Map conf = new HashMap<>(); + + DelegateRefresher refresher = + new DelegateRefresher<>( + URLFilter.class, conf, params, (d, c, p) -> d.configure(c, p)); + + try { + String result = refresher.getDelegate().filter(null, null, "http://example.com"); + assertEquals("http://example.com", result); + } finally { + refresher.cleanup(); + } + } + + @Test + void 
throwsWhenDelegateNodeMissing() { + ObjectNode params = MAPPER.createObjectNode(); + // no "delegate" key + Map conf = new HashMap<>(); + + assertThrows( + RuntimeException.class, + () -> + new DelegateRefresher<>( + URLFilter.class, conf, params, (d, c, p) -> d.configure(c, p))); + } + + @Test + void throwsWhenClassMissing() { + ObjectNode delegate = MAPPER.createObjectNode(); + // no "class" key + ObjectNode params = MAPPER.createObjectNode(); + params.set("delegate", delegate); + Map conf = new HashMap<>(); + + assertThrows( + RuntimeException.class, + () -> + new DelegateRefresher<>( + URLFilter.class, conf, params, (d, c, p) -> d.configure(c, p))); + } + + @Test + void throwsWhenClassDoesNotExtendBaseType() { + JsonNode params = buildParams(NotAFilter.class.getName()); + Map conf = new HashMap<>(); + + RuntimeException ex = + assertThrows( + RuntimeException.class, + () -> + new DelegateRefresher<>( + URLFilter.class, + conf, + params, + (d, c, p) -> d.configure(c, p))); + assertTrue(ex.getMessage().contains("does not extend")); + } + + @Test + void throwsWhenClassDoesNotImplementJSONResource() { + JsonNode params = buildParams(NonJsonResourceURLFilter.class.getName()); + Map conf = new HashMap<>(); + + RuntimeException ex = + assertThrows( + RuntimeException.class, + () -> + new DelegateRefresher<>( + URLFilter.class, + conf, + params, + (d, c, p) -> d.configure(c, p))); + assertTrue(ex.getMessage().contains("does not implement JSONResource")); + } + + @Test + void cleanupIsIdempotent() { + JsonNode params = buildParams(StubURLFilter.class.getName()); + Map conf = new HashMap<>(); + + DelegateRefresher refresher = + new DelegateRefresher<>( + URLFilter.class, conf, params, (d, c, p) -> d.configure(c, p)); + + // calling cleanup twice should not throw + refresher.cleanup(); + refresher.cleanup(); + } +} diff --git a/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/WaitAckCacheTest.java 
b/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/WaitAckCacheTest.java new file mode 100644 index 000000000..3bd2dc6d4 --- /dev/null +++ b/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/WaitAckCacheTest.java @@ -0,0 +1,336 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.stormcrawler.opensearch; + +import static org.awaitility.Awaitility.await; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertSame; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.TimeUnit; +import org.apache.storm.tuple.Tuple; +import org.apache.stormcrawler.metrics.ScopedCounter; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.opensearch.client.opensearch._types.ErrorCause; +import org.opensearch.client.opensearch.core.BulkRequest; +import org.opensearch.client.opensearch.core.BulkResponse; +import org.opensearch.client.opensearch.core.bulk.BulkOperation; +import org.opensearch.client.opensearch.core.bulk.BulkResponseItem; +import org.opensearch.client.opensearch.core.bulk.OperationType; +import org.slf4j.LoggerFactory; + +class WaitAckCacheTest { + + private WaitAckCache cache; + private List evicted; + private List acked; + private List failed; + + @BeforeEach + void setUp() { + evicted = new CopyOnWriteArrayList<>(); + acked = new ArrayList<>(); + failed = new ArrayList<>(); + cache = new WaitAckCache(LoggerFactory.getLogger(WaitAckCacheTest.class), evicted::add); + } + + private Tuple mockTuple(String url) { + Tuple t = mock(Tuple.class); + when(t.getValueByField("url")).thenReturn(url); + when(t.getStringByField("url")).thenReturn(url); + return t; + } + + private static BulkResponseItem successItem(String docId) { + return BulkResponseItem.of( + b -> b.id(docId).index("index").status(200).operationType(OperationType.Index)); + } + + private static BulkResponseItem failedItem(String docId, int 
status) { + return BulkResponseItem.of( + b -> + b.id(docId) + .index("index") + .status(status) + .operationType(OperationType.Index) + .error( + ErrorCause.of( + e -> e.type("test_error").reason("test failure")))); + } + + private static BulkResponse bulkResponse(BulkResponseItem... items) { + boolean hasErrors = false; + for (BulkResponseItem item : items) { + if (item.error() != null) { + hasErrors = true; + break; + } + } + final boolean errors = hasErrors; + return BulkResponse.of(b -> b.took(10).errors(errors).items(List.of(items))); + } + + @Test + void addAndContains() { + Tuple t = mockTuple("http://example.com"); + assertFalse(cache.contains("doc1")); + + cache.addTuple("doc1", t); + assertTrue(cache.contains("doc1")); + assertEquals(1, cache.estimatedSize()); + } + + @Test + void invalidateRemovesEntry() { + Tuple t = mockTuple("http://example.com"); + cache.addTuple("doc1", t); + assertTrue(cache.contains("doc1")); + + cache.invalidate("doc1"); + assertFalse(cache.contains("doc1")); + } + + @Test + void processBulkResponse_successfulItem_ackedViaTupleAction() { + Tuple t = mockTuple("http://example.com"); + cache.addTuple("doc1", t); + + BulkResponse response = bulkResponse(successItem("doc1")); + + cache.processBulkResponse( + response, + 1L, + null, + (id, tuple, selected) -> { + if (!selected.failed()) { + acked.add(tuple); + } else { + failed.add(tuple); + } + }); + + assertEquals(1, acked.size()); + assertEquals(0, failed.size()); + assertSame(t, acked.get(0)); + assertFalse(cache.contains("doc1")); + } + + @Test + void processBulkResponse_failedItem_failedViaTupleAction() { + Tuple t = mockTuple("http://example.com"); + cache.addTuple("doc1", t); + + BulkResponse response = bulkResponse(failedItem("doc1", 500)); + + cache.processBulkResponse( + response, + 1L, + null, + (id, tuple, selected) -> { + if (!selected.failed()) { + acked.add(tuple); + } else { + failed.add(tuple); + } + }); + + assertEquals(0, acked.size()); + assertEquals(1, 
failed.size()); + assertSame(t, failed.get(0)); + } + + @Test + void processBulkResponse_conflictIsNotAFailure() { + Tuple t = mockTuple("http://example.com"); + cache.addTuple("doc1", t); + + ScopedCounter counter = mock(ScopedCounter.class); + ScopedCounter.CountHandle handle = mock(ScopedCounter.CountHandle.class); + when(counter.scope("doc_conflicts")).thenReturn(handle); + + BulkResponse response = bulkResponse(failedItem("doc1", 409)); + + cache.processBulkResponse( + response, + 1L, + counter, + (id, tuple, selected) -> { + if (!selected.failed()) { + acked.add(tuple); + } else { + failed.add(tuple); + } + }); + + assertEquals(1, acked.size()); + assertEquals(0, failed.size()); + verify(handle).incrBy(1); + } + + @Test + void processBulkResponse_multipleTuplesForSameDocId() { + Tuple t1 = mockTuple("http://example.com/1"); + Tuple t2 = mockTuple("http://example.com/2"); + cache.addTuple("doc1", t1); + cache.addTuple("doc1", t2); + + BulkResponse response = bulkResponse(successItem("doc1")); + + cache.processBulkResponse(response, 1L, null, (id, tuple, selected) -> acked.add(tuple)); + + assertEquals(2, acked.size()); + assertTrue(acked.contains(t1)); + assertTrue(acked.contains(t2)); + } + + @Test + void processBulkResponse_duplicateDocIdInBulk_prefersSuccess() { + // https://github.com/apache/stormcrawler/issues/832 + Tuple t = mockTuple("http://example.com"); + cache.addTuple("doc1", t); + + BulkResponse response = bulkResponse(failedItem("doc1", 500), successItem("doc1")); + + cache.processBulkResponse( + response, + 1L, + null, + (id, tuple, selected) -> { + if (!selected.failed()) { + acked.add(tuple); + } else { + failed.add(tuple); + } + }); + + assertEquals(1, acked.size()); + assertEquals(0, failed.size()); + } + + @Test + void processFailedBulk_failsAllMatchingTuples() { + Tuple t1 = mockTuple("http://example.com/1"); + Tuple t2 = mockTuple("http://example.com/2"); + cache.addTuple("doc1", t1); + cache.addTuple("doc2", t2); + + BulkRequest request 
= + BulkRequest.of( + b -> + b.operations( + BulkOperation.of( + o -> o.delete(d -> d.index("index").id("doc1"))), + BulkOperation.of( + o -> o.delete(d -> d.index("index").id("doc2"))))); + + cache.processFailedBulk(request, 1L, new Exception("connection lost"), failed::add); + + assertEquals(2, failed.size()); + assertTrue(failed.contains(t1)); + assertTrue(failed.contains(t2)); + assertFalse(cache.contains("doc1")); + assertFalse(cache.contains("doc2")); + } + + @Test + void processFailedBulk_ignoresMissingIds() { + Tuple t = mockTuple("http://example.com"); + cache.addTuple("doc1", t); + + BulkRequest request = + BulkRequest.of( + b -> + b.operations( + BulkOperation.of( + o -> + o.delete( + d -> + d.index("index") + .id( + "doc_unknown"))))); + + cache.processFailedBulk(request, 1L, new Exception("test"), failed::add); + + assertEquals(0, failed.size()); + // doc1 should still be in cache since it wasn't in the failed request + assertTrue(cache.contains("doc1")); + } + + @Test + void eviction_failsTuplesOnExpiry() { + cache = + new WaitAckCache( + "expireAfterWrite=1s", + LoggerFactory.getLogger(WaitAckCacheTest.class), + evicted::add); + Tuple t = mockTuple("http://example.com"); + cache.addTuple("doc1", t); + + // Force cache maintenance after expiry by doing a contains() check + // which accesses the cache and triggers Caffeine's cleanup + await().atMost(5, TimeUnit.SECONDS) + .pollInterval(200, TimeUnit.MILLISECONDS) + .untilAsserted( + () -> { + // contains() accesses the cache which triggers cleanup + cache.contains("doc1"); + // also try adding and invalidating a dummy entry to force maintenance + Tuple dummy = mockTuple("http://dummy"); + cache.addTuple("_probe_", dummy); + cache.invalidate("_probe_"); + assertFalse(evicted.isEmpty(), "Eviction callback should have fired"); + }); + + assertTrue(evicted.contains(t)); + } + + @Test + void processBulkResponse_multipleDocIds() { + Tuple t1 = mockTuple("http://example.com/1"); + Tuple t2 = 
mockTuple("http://example.com/2"); + cache.addTuple("doc1", t1); + cache.addTuple("doc2", t2); + + BulkResponse response = bulkResponse(successItem("doc1"), failedItem("doc2", 500)); + + cache.processBulkResponse( + response, + 1L, + null, + (id, tuple, selected) -> { + if (!selected.failed()) { + acked.add(tuple); + } else { + failed.add(tuple); + } + }); + + assertEquals(1, acked.size()); + assertSame(t1, acked.get(0)); + assertEquals(1, failed.size()); + assertSame(t2, failed.get(0)); + } +} diff --git a/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/bolt/StatusBoltTest.java b/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/bolt/StatusBoltTest.java index 414d1b984..929ae5c11 100644 --- a/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/bolt/StatusBoltTest.java +++ b/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/bolt/StatusBoltTest.java @@ -32,7 +32,7 @@ import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; -import org.apache.http.HttpHost; +import org.apache.hc.core5.http.HttpHost; import org.apache.storm.task.OutputCollector; import org.apache.storm.tuple.Tuple; import org.apache.stormcrawler.Metadata; @@ -46,11 +46,11 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Timeout; -import org.opensearch.client.RestClient; import org.opensearch.client.json.jackson.JacksonJsonpMapper; import org.opensearch.client.opensearch.OpenSearchClient; import org.opensearch.client.opensearch.core.GetResponse; -import org.opensearch.client.transport.rest_client.RestClientTransport; +import org.opensearch.client.transport.OpenSearchTransport; +import org.opensearch.client.transport.httpclient5.ApacheHttpClient5TransportBuilder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -62,7 +62,7 @@ class StatusBoltTest extends AbstractOpenSearchTest { 
protected OpenSearchClient client; - private RestClient restClient; + private OpenSearchTransport transport; private static final Logger LOG = LoggerFactory.getLogger(StatusBoltTest.class); @@ -82,14 +82,14 @@ static void afterClass() { @BeforeEach void setupStatusBolt() throws IOException { bolt = new StatusUpdaterBolt(); - restClient = - RestClient.builder( + transport = + ApacheHttpClient5TransportBuilder.builder( new HttpHost( + "http", opensearchContainer.getHost(), opensearchContainer.getMappedPort(9200))) + .setMapper(new JacksonJsonpMapper()) .build(); - RestClientTransport transport = - new RestClientTransport(restClient, new JacksonJsonpMapper()); client = new OpenSearchClient(transport); // configure the status updater bolt Map conf = new HashMap<>(); @@ -111,7 +111,7 @@ void close() { bolt.cleanup(); output = null; try { - restClient.close(); + transport.close(); } catch (IOException e) { } } diff --git a/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/metrics/MetricsReporterTest.java b/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/metrics/MetricsReporterTest.java new file mode 100644 index 000000000..8c0c9413e --- /dev/null +++ b/external/opensearch-java/src/test/java/org/apache/stormcrawler/opensearch/metrics/MetricsReporterTest.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.stormcrawler.opensearch.metrics; + +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import com.codahale.metrics.Counter; +import com.codahale.metrics.MetricRegistry; +import java.util.HashMap; +import java.util.Map; +import org.apache.stormcrawler.opensearch.bolt.AbstractOpenSearchTest; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; + +class MetricsReporterTest extends AbstractOpenSearchTest { + + @Test + @Timeout(60) + void prepareAndReportMetrics() { + MetricRegistry registry = new MetricRegistry(); + Counter counter = registry.counter("test.counter"); + counter.inc(42); + + Map topoConf = new HashMap<>(); + topoConf.put( + "opensearch.metrics.addresses", + opensearchContainer.getHost() + ":" + opensearchContainer.getFirstMappedPort()); + + Map reporterConf = new HashMap<>(); + reporterConf.put("report.period", 60L); + reporterConf.put("report.period.units", "SECONDS"); + + MetricsReporter reporter = new MetricsReporter(); + assertDoesNotThrow(() -> reporter.prepare(registry, topoConf, reporterConf)); + assertNotNull(reporter); + reporter.stop(); + } +} diff --git a/pom.xml b/pom.xml index 0e0d7daa9..ac09ce6dc 100644 --- a/pom.xml +++ b/pom.xml @@ -559,7 +559,6 @@ under the License. 
CONTRIBUTING.md RELEASING.md external/opensearch/dashboards/** - external/opensearch-java/dashboards/** external/solr/archetype/src/main/resources/archetype-resources/configsets/** THIRD-PARTY.properties THIRD-PARTY.txt @@ -730,7 +729,6 @@ under the License. external/warc archetype external/opensearch/archetype - external/opensearch-java/archetype external/solr/archetype docs From 482512ce057b54dcc7b2ca5ce7b078561d683f4e Mon Sep 17 00:00:00 2001 From: Davide Polato Date: Sat, 11 Apr 2026 12:05:22 +0200 Subject: [PATCH 4/4] Address reviewer feedback --- external/opensearch-java/README.md | 16 ++++++++++ .../opensearch/OpenSearchConnection.java | 30 +++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/external/opensearch-java/README.md b/external/opensearch-java/README.md index 080eef36d..2cedc3694 100644 --- a/external/opensearch-java/README.md +++ b/external/opensearch-java/README.md @@ -45,3 +45,19 @@ For a ready-to-use crawler configuration, example Flux topologies, index initialization scripts and OpenSearch Dashboards exports, refer to the [`external/opensearch`](../opensearch) module: all of those resources are compatible with this module and have not been duplicated here. + +Differences from the legacy `external/opensearch` module +--------------------- + +* `opensearch..responseBufferSize` is no longer supported. The legacy + module used the HC4-based low-level REST client and set a heap response + buffer via `HeapBufferedResponseConsumerFactory`. The HC5-based async + transport used here does not expose an equivalent per-request override, so + the key is ignored. A `WARN` is logged at startup if it is found in the + configuration; remove it when migrating. +* `opensearch..sniff` is no longer supported. The legacy module enabled + node auto-discovery by default via the low-level REST client `Sniffer`. The + OpenSearch Java Client 3.x does not ship a sniffer equivalent, so this + feature is dropped. 
Keep the `addresses` list up to date manually or put a + load balancer in front of the cluster. A `WARN` is logged at startup if the + key is found in the configuration; remove it when migrating. diff --git a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/OpenSearchConnection.java b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/OpenSearchConnection.java index 0d8675398..4c31a74a2 100644 --- a/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/OpenSearchConnection.java +++ b/external/opensearch-java/src/main/java/org/apache/stormcrawler/opensearch/OpenSearchConnection.java @@ -123,6 +123,8 @@ public static OpenSearchConnection getConnection( final String dottedType = boltType + "."; + warnOnRemovedKeys(stormConf, dottedType); + ClientResources cr = buildClientResources(stormConf, boltType); final String flushIntervalString = @@ -218,6 +220,34 @@ public static String getBulkOperationId(BulkOperation op) { // internal helpers private record ClientResources(OpenSearchClient client, OpenSearchTransport transport) {} + /** + * Logs a WARN for legacy configuration keys that are no longer honoured by this module, so that + * users migrating from {@code external/opensearch} notice silently-dropped tuning. See the + * module README for the full list of differences. + */ + private static void warnOnRemovedKeys(Map stormConf, String dottedType) { + final String responseBufferKey = Constants.PARAMPREFIX + dottedType + "responseBufferSize"; + if (stormConf.containsKey(responseBufferKey)) { + LOG.warn( + "Configuration key '{}' is set but no longer supported by the opensearch-java module. " + + "The HC5-based async transport does not expose an equivalent per-request " + + "heap-buffer override. The setting is ignored — remove it from your " + + "configuration. 
See external/opensearch-java/README.md for details.", + responseBufferKey); + } + + final String sniffKey = Constants.PARAMPREFIX + dottedType + "sniff"; + if (stormConf.containsKey(sniffKey)) { + LOG.warn( + "Configuration key '{}' is set but no longer supported by the opensearch-java module. " + + "The OpenSearch Java Client 3.x does not ship a Sniffer equivalent, so " + + "automatic node discovery is not available. Keep the 'addresses' list up to " + + "date manually or put a load balancer in front of the cluster. " + + "See external/opensearch-java/README.md for details.", + sniffKey); + } + } + private static ClientResources buildClientResources( Map stormConf, String boltType) {