diff --git a/external/elasticsearch/README.md b/external/elasticsearch/README.md index a8cfe1a92..44b14e8c2 100644 --- a/external/elasticsearch/README.md +++ b/external/elasticsearch/README.md @@ -1,70 +1,6 @@ storm-crawler-elasticsearch =========================== -A collection of resources for [Elasticsearch](https://www.elastic.co/products/elasticsearch): -* [IndexerBolt](https://github.org/apache/incubator-stormcrawler/blob/master/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/bolt/IndexerBolt.java) for indexing documents crawled with StormCrawler -* [Spouts](https://github.org/apache/incubator-stormcrawler/blob/master/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/persistence/AggregationSpout.java) and [StatusUpdaterBolt](https://github.org/apache/incubator-stormcrawler/blob/master/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/persistence/StatusUpdaterBolt.java) for persisting URL information in recursive crawls -* [MetricsConsumer](https://github.org/apache/incubator-stormcrawler/blob/master/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/metrics/MetricsConsumer.java) -* [StatusMetricsBolt](https://github.org/apache/incubator-stormcrawler/blob/master/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/metrics/StatusMetricsBolt.java) for sending the breakdown of URLs per status as metrics and display its evolution over time. - -as well as an archetype containing a basic crawl topology and its configuration. - -We also have resources for [Kibana](https://www.elastic.co/products/kibana) to build basic real-time monitoring dashboards for the crawls. A dashboard for [Grafana](http://grafana.com/) is also [available](https://grafana.com/dashboards/2363). - -Getting started ---------------------- - -Use the archetype for Elasticsearch with: - -`mvn archetype:generate -DarchetypeGroupId=org.apache.stormcrawler -DarchetypeArtifactId=storm-crawler-elasticsearch-archetype -DarchetypeVersion=2.11` - -You'll be asked to enter a groupId (e.g. com.mycompany.crawler), an artefactId (e.g. stormcrawler), a version, a package name and details about the user agent to use. - -This will not only create a fully formed project containing a POM with the dependency above but also a set of resources, configuration files and a topology class. Enter the directory you just created (should be the same as the artefactId you specified earlier) and follow the instructions on the README file. - -Video tutorial ---------------------- - -[![Video tutorial](https://i.ytimg.com/vi/8kpJLPdhvLw/hqdefault.jpg)](https://youtu.be/8kpJLPdhvLw) - - -Kibana ---------------------- - -To import the dashboards into a local instance of Kibana, go into the folder _kibana_ and run the script _importKibana.sh_. - -You should see something like - -``` -Importing status dashboard into Kibana -{"success":true,"successCount":4} -Importing metrics dashboard into Kibana -{"success":true,"successCount":9} -``` - -The [dashboard screen](http://localhost:5601/app/kibana#/dashboards) should show both the status and metrics dashboards. If you click on `Crawl Status`, you should see 2 tables containing the count of URLs per status and the top hostnames per URL count. -The [Metrics dashboard](http://localhost:5601/app/kibana#/dashboard/Crawl-metrics) can be used to monitor the progress of the crawl. - -The file _storm.ndjson_ is used to display some of Storm's internal metrics and is not added by default. 
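For reference, the import script ships that Storm dashboard import as a commented-out command; a minimal sketch of loading it by hand, assuming Kibana is running on its default local port and the command is run from the project root:

```sh
# Optional: load the Storm internal-metrics dashboard into a local Kibana instance
curl -X POST "localhost:5601/api/saved_objects/_import" \
     -H "kbn-xsrf: true" \
     --form file=@kibana/storm.ndjson
```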
- -#### Per time period metric indices (optional) - -The _metrics_ index can be configured per time period. This best practice is [discussed on the Elastic website](https://www.elastic.co/guide/en/elasticsearch/guide/current/time-based.html). - -The crawler config YAML must be updated to use an optional argument as shown below to have one index per day: - -``` - #Metrics consumers: - topology.metrics.consumer.register: - - class: "org.apache.stormcrawler.elasticsearch.metrics.MetricsConsumer" - parallelism.hint: 1 - argument: "yyyy-MM-dd" -``` - - - - - - - - +Recent Elasticsearch releases are published under the `Elastic License 2.0`, which is not compatible with the Apache License 2.0. +The corresponding module has therefore been removed. Users can switch to the `opensearch` module and follow the +[OpenSearch upgrade guide](https://opensearch.org/docs/latest/upgrade-to/upgrade-to/) to migrate existing Elasticsearch installations.
diff --git a/external/elasticsearch/archetype/pom.xml b/external/elasticsearch/archetype/pom.xml deleted file mode 100644 index 499173ee7..000000000 --- a/external/elasticsearch/archetype/pom.xml +++ /dev/null @@ -1,72 +0,0 @@ - - - - - - 4.0.0 - - - org.apache.stormcrawler - storm-crawler - 2.12-SNAPSHOT - ../../../pom.xml - - - storm-crawler-elasticsearch-archetype - - maven-archetype - - - - - - src/main/resources - true - - META-INF/maven/archetype-metadata.xml - - - - src/main/resources - false - - META-INF/maven/archetype-metadata.xml - - - - - - - org.apache.maven.archetype - archetype-packaging - 3.1.2 - - - - - - - maven-archetype-plugin - 3.2.1 - - - - -
diff --git a/external/elasticsearch/archetype/src/main/resources/META-INF/archetype-post-generate.groovy b/external/elasticsearch/archetype/src/main/resources/META-INF/archetype-post-generate.groovy deleted file mode 100644 index 499d22941..000000000 --- a/external/elasticsearch/archetype/src/main/resources/META-INF/archetype-post-generate.groovy +++ /dev/null @@ -1,5 +0,0 @@ -def file = new File(request.getOutputDirectory(), request.getArtifactId() + "/ES_IndexInit.sh") -file.setExecutable(true, false) - -def file2 = new File(request.getOutputDirectory(), request.getArtifactId() + "/kibana/importKibana.sh") -file2.setExecutable(true, false)
diff --git a/external/elasticsearch/archetype/src/main/resources/META-INF/maven/archetype-metadata.xml b/external/elasticsearch/archetype/src/main/resources/META-INF/maven/archetype-metadata.xml deleted file mode 100644 index c3a0c6d2f..000000000 --- a/external/elasticsearch/archetype/src/main/resources/META-INF/maven/archetype-metadata.xml +++ /dev/null @@ -1,77 +0,0 @@ - - - - - - - - - ^[a-zA-Z_\-]+$ - - - - - - ^\S+@\S+\.\S+$ - - - ${project.version} - - - - - - src/main/java - - **/*.java - - - - src/main/resources - - **/*.xml - **/*.txt - **/*.yaml - **/*.json - - - - - - README.md - ES_IndexInit.sh - *.flux - *.yaml - - - - kibana - - *.sh - *.ndjson - - - - -
diff --git a/external/elasticsearch/archetype/src/main/resources/archetype-resources/ES_IndexInit.sh b/external/elasticsearch/archetype/src/main/resources/archetype-resources/ES_IndexInit.sh deleted file mode 100755 index 4b6a75ca3..000000000 --- a/external/elasticsearch/archetype/src/main/resources/archetype-resources/ES_IndexInit.sh +++ /dev/null @@ -1,173 +0,0 @@ -ESHOST=${1:-"http://localhost:9200"} -ESCREDENTIALS=${2:-"-u elastic:passwordhere"} - -# deletes and recreates a status index with a bespoke schema - -curl $ESCREDENTIALS -s -XDELETE "$ESHOST/status/" > /dev/null - -echo "Deleted status index"
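As a rough illustration of the migration path mentioned in the replacement README above (a sketch only, with placeholder hosts and ports): once an OpenSearch cluster is available, an existing Elasticsearch index such as `status` can be copied over with the reindex-from-remote API, provided the source cluster is listed in the `reindex.remote.whitelist` setting of the OpenSearch nodes.

```sh
# Sketch: copy the 'status' index from an old Elasticsearch cluster (assumed on :9200)
# into an OpenSearch cluster (assumed on :9201)
curl -s -XPOST "http://localhost:9201/_reindex" -H 'Content-Type: application/json' -d '
{
  "source": {
    "remote": { "host": "http://localhost:9200" },
    "index": "status"
  },
  "dest": { "index": "status" }
}'
```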
- -# http://localhost:9200/status/_mapping/status?pretty - -echo "Creating status index with mapping" - -curl $ESCREDENTIALS -s -XPUT $ESHOST/status -H 'Content-Type: application/json' -d ' -{ - "settings": { - "index": { - "number_of_shards": 10, - "number_of_replicas": 1, - "refresh_interval": "5s" - } - }, - "mappings": { - "dynamic_templates": [{ - "metadata": { - "path_match": "metadata.*", - "match_mapping_type": "string", - "mapping": { - "type": "keyword" - } - } - }], - "_source": { - "enabled": true - }, - "properties": { - "key": { - "type": "keyword", - "index": true - }, - "nextFetchDate": { - "type": "date", - "format": "date_optional_time" - }, - "status": { - "type": "keyword" - }, - "url": { - "type": "keyword" - } - } - } -}' - -# deletes and recreates a status index with a bespoke schema - -curl $ESCREDENTIALS -s -XDELETE "$ESHOST/metrics*/" > /dev/null - -echo "" -echo "Deleted metrics index" - -curl $ESCREDENTIALS -s -XPUT $ESHOST/_ilm/policy/7d-deletion_policy -H 'Content-Type:application/json' -d ' -{ - "policy": { - "phases": { - "delete": { - "min_age": "7d", - "actions": { - "delete": {} - } - } - } - } -} -' - -echo "Creating metrics index with mapping" - -# http://localhost:9200/metrics/_mapping/status?pretty -curl $ESCREDENTIALS -s -XPOST $ESHOST/_template/storm-metrics-template -H 'Content-Type: application/json' -d ' -{ - "index_patterns": "metrics*", - "settings": { - "index": { - "number_of_shards": 1, - "refresh_interval": "30s" - }, - "number_of_replicas": 0, - "lifecycle.name": "7d-deletion_policy" - }, - "mappings": { - "_source": { "enabled": true }, - "properties": { - "name": { - "type": "keyword" - }, - "stormId": { - "type": "keyword" - }, - "srcComponentId": { - "type": "keyword" - }, - "srcTaskId": { - "type": "short" - }, - "srcWorkerHost": { - "type": "keyword" - }, - "srcWorkerPort": { - "type": "integer" - }, - "timestamp": { - "type": "date", - "format": "date_optional_time" - }, - "value": { - "type": "double" - } - } - } -}' - -# deletes and recreates a doc index with a bespoke schema - -curl $ESCREDENTIALS -s -XDELETE "$ESHOST/content*/" > /dev/null - -echo "" -echo "Deleted content index" - -echo "Creating content index with mapping" - -curl $ESCREDENTIALS -s -XPUT $ESHOST/content -H 'Content-Type: application/json' -d ' -{ - "settings": { - "index": { - "number_of_shards": 5, - "number_of_replicas": 1, - "refresh_interval": "60s" - } - }, - "mappings": { - "_source": { - "enabled": true - }, - "properties": { - "content": { - "type": "text" - }, - "description": { - "type": "text" - }, - "domain": { - "type": "keyword" - }, - "format": { - "type": "keyword" - }, - "keywords": { - "type": "keyword" - }, - "host": { - "type": "keyword" - }, - "title": { - "type": "text" - }, - "url": { - "type": "keyword" - } - } - } -}' - diff --git a/external/elasticsearch/archetype/src/main/resources/archetype-resources/README.md b/external/elasticsearch/archetype/src/main/resources/archetype-resources/README.md deleted file mode 100644 index 27e9183e2..000000000 --- a/external/elasticsearch/archetype/src/main/resources/archetype-resources/README.md +++ /dev/null @@ -1,62 +0,0 @@ -This has been generated by the StormCrawler Maven Archetype as a starting point for building your own crawler with Elasticsearch as a backend. -Have a look at the code and resources and modify them to your heart's content. 
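After running _ES_IndexInit.sh_, a quick way to check that the `status` and `content` indices were created is to list them, using the same default endpoint and credentials as the script:

```sh
# list the indices created by ES_IndexInit.sh
curl -u elastic:passwordhere -s "http://localhost:9200/_cat/indices?v"
```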
- -First generate an uberjar: - -``` sh -mvn clean package -``` - -then with Elasticsearch running locally, run `./ES_IndexInit.sh` to define the indices used by StormCrawler. - -The first step consists in creating a file _seeds.txt_ in the current directory and populating it with the URLs -to be used as a starting point for the crawl, e.g. - -`echo "http://stormcrawler.net/" > seeds.txt` - -You can start the crawl topology using the Java class - -``` sh -storm local target/${artifactId}-${version}.jar --local-ttl 3600 ${package}.ESCrawlTopology -- -conf crawler-conf.yaml -conf es-conf.yaml . seeds.txt -``` - -This will run the topology in local mode for 1 hour, using the URLs in _seeds.txt_ as a starting point. To start the topology in distributed mode, where it will run indefinitely, launch it with 'storm jar'. - -Alternatively, you can also use Flux to do the same but the injection is separated from the crawl: - -``` sh - -storm local target/${artifactId}-${version}.jar org.apache.storm.flux.Flux es-injection.flux --local-ttl 3600 - -storm local target/${artifactId}-${version}.jar org.apache.storm.flux.Flux es-crawler.flux --local-ttl 3600 -``` - -Note that in local mode, Flux uses a default TTL for the topology of 20 secs. The command above runs the topology for 1 hour. - -It is best to run the topology with `storm jar` to benefit from the Storm UI and logging. In that case, the topology runs continuously, as intended. - -Kibana ---------------------- - -To import the dashboards into a local instance of Kibana, go into the folder _kibana_ and run the script _importKibana.sh_. - -You should see something like - -``` -Importing status dashboard into Kibana -{"success":true,"successCount":4} -Importing metrics dashboard into Kibana -{"success":true,"successCount":9} -``` - -The [dashboard screen](http://localhost:5601/app/kibana#/dashboards) should show both the status and metrics dashboards. If you click on `Crawl Status`, you should see 2 tables containing the count of URLs per status and the top hostnames per URL count. -The [Metrics dashboard](http://localhost:5601/app/kibana#/dashboard/Crawl-metrics) can be used to monitor the progress of the crawl. - -The file _storm.ndjson_ is used to display some of Storm's internal metrics and is not added by default. - - - -Happy crawling! If you have any questions, please ask on [StackOverflow with the tag stormcrawler](http://stackoverflow.com/questions/tagged/stormcrawler). - - - diff --git a/external/elasticsearch/archetype/src/main/resources/archetype-resources/crawler-conf.yaml b/external/elasticsearch/archetype/src/main/resources/archetype-resources/crawler-conf.yaml deleted file mode 100644 index d21929e71..000000000 --- a/external/elasticsearch/archetype/src/main/resources/archetype-resources/crawler-conf.yaml +++ /dev/null @@ -1,145 +0,0 @@ -# Custom configuration for StormCrawler -# This is used to override the default values from crawler-default.xml and provide additional ones -# for your custom components. -# Use this file with the parameter -conf when launching your extension of ConfigurableTopology. -# This file does not contain all the key values but only the most frequently used ones. See crawler-default.xml for an extensive list. 
- -config: - topology.workers: 1 - topology.message.timeout.secs: 300 - topology.max.spout.pending: 100 - topology.debug: false - - fetcher.threads.number: 50 - - # override the JVM parameters for the workers - topology.worker.childopts: "-Xmx2g -Djava.net.preferIPv4Stack=true" - - # mandatory when using Flux - topology.kryo.register: - - org.apache.stormcrawler.Metadata - - org.apache.stormcrawler.persistence.Status - - # Lists the metadata to transfer to outlinks - # Used by Fetcher and SiteMapParser for redirections, - # discovered links, passing cookies to child pages, etc. - # These are also persisted for the parent document (see below). - # Allows wildcards, eg. "follow.*" transfers all metadata starting with "follow.". - # metadata.transfer: - # - customMetadataName - - # Lists the metadata to persist to storage - # These are not transferred to the outlinks. Also allows wildcards, eg. "follow.*". - metadata.persist: - - _redirTo - - error.cause - - error.source - - isSitemap - - isFeed - - # Agent name info - given here as an example. Do not be an anonynmous coward, use your real information! - # The full user agent value sent as part of the HTTP requests - # is built from the elements below. Only the agent.name is mandatory, - # it is also used to parse the robots.txt directives. - - # The agent name must be compliant with RFC 9309 (section 2.2.1) - # i.e. it MUST contain only uppercase and lowercase letters ("a-z" and "A-Z), underscores ("_"), and hyphens ("-") - http.agent.name: "${http-agent-name}" - # version of your crawler - http.agent.version: "${http-agent-version}" - # description of what it does - http.agent.description: "${http-agent-description}" - # URL webmasters can go to to learn about it - http.agent.url: "${http-agent-url}" - # Finally, an email so that they can get in touch with you - http.agent.email: "${http-agent-email}" - - http.protocol.implementation: "org.apache.stormcrawler.protocol.okhttp.HttpProtocol" - https.protocol.implementation: "org.apache.stormcrawler.protocol.okhttp.HttpProtocol" - - # The maximum number of bytes for returned HTTP response bodies. - # The fetched page will be trimmed to 65KB in this case - # Set -1 to disable the limit. 
- http.content.limit: 65536 - - sitemap.discovery: true - - # FetcherBolt queue dump => comment out to activate - # if a file exists on the worker machine with the corresponding port number - # the FetcherBolt will log the content of its internal queues to the logs - # fetcherbolt.queue.debug.filepath: "/tmp/fetcher-dump-{port}" - - parsefilters.config.file: "parsefilters.json" - urlfilters.config.file: "urlfilters.json" - jsoup.filters.config.file: "jsoupfilters.json" - - # revisit a page daily (value in minutes) - # set it to -1 to never refetch a page - fetchInterval.default: 1440 - - # revisit a page with a fetch error after 2 hours (value in minutes) - # set it to -1 to never refetch a page - fetchInterval.fetch.error: 120 - - # never revisit a page with an error (or set a value in minutes) - fetchInterval.error: -1 - - # set to true if you don't need any text to be extracted by JSoup - textextractor.no.text: false - - # text extraction for JSoupParserBolt - textextractor.include.pattern: - - DIV[id="maincontent"] - - DIV[itemprop="articleBody"] - - ARTICLE - - textextractor.exclude.tags: - - STYLE - - SCRIPT - - # needed for parsing with Tika - jsoup.treat.non.html.as.error: false - - # restricts the documents types to be parsed with Tika - parser.mimetype.whitelist: - - application/.+word.* - - application/.+excel.* - - application/.+powerpoint.* - - application/.*pdf.* - - # Tika parser configuration file - parse.tika.config.file: "tika-config.xml" - - # custom fetch interval to be used when a document has the key/value in its metadata - # and has been fetched successfully (value in minutes) - # fetchInterval.FETCH_ERROR.isFeed=true: 30 - # fetchInterval.isFeed=true: 10 - - # configuration for the classes extending AbstractIndexerBolt - # indexer.md.filter: "someKey=aValue" - indexer.url.fieldname: "url" - indexer.text.fieldname: "content" - indexer.canonical.name: "canonical" - # How to convert metadata key values into fields for indexing - # - # if no alias is specified with =alias, the key value is used - # for instance below, _domain_ and _format_ will be used - # as field names, whereas _title_ will be used for _parse.title_. - # You can specify the index of the value to store from the values array - # by using the _key[index]_ format, e.g. _parse.title[0]_ would try to - # get the first value for the metadata _parse.title_ (which is the default anyway). - # Finally, you can use a glob (*) to match all the keys, e.g. _parse.*_ would - # index all the keys with _parse_ as a prefix. Note that in that case, you can't - # specify an alias with =, nor can you specify an index. 
- indexer.md.mapping: - - parse.title=title - - parse.keywords=keywords - - parse.description=description - - domain - - format - - # Metrics consumers: - topology.metrics.consumer.register: - - class: "org.apache.storm.metric.LoggingMetricsConsumer" - parallelism.hint: 1 - diff --git a/external/elasticsearch/archetype/src/main/resources/archetype-resources/es-conf.yaml b/external/elasticsearch/archetype/src/main/resources/archetype-resources/es-conf.yaml deleted file mode 100644 index 848173f5f..000000000 --- a/external/elasticsearch/archetype/src/main/resources/archetype-resources/es-conf.yaml +++ /dev/null @@ -1,98 +0,0 @@ -# configuration for Elasticsearch resources - -config: - # ES indexer bolt - # addresses can be specified as a full URL - # if not we assume that the protocol is http and the port 9200 - es.indexer.addresses: "localhost" - es.indexer.index.name: "content" - # es.indexer.pipeline: "_PIPELINE_" - es.indexer.create: false - es.indexer.bulkActions: 100 - es.indexer.flushInterval: "2s" - es.indexer.concurrentRequests: 1 - - # allows to use the Rest client on ES8+ - es.indexer.compatibility.mode: false - - # ES metricsConsumer - es.metrics.addresses: "http://localhost:9200" - es.metrics.index.name: "metrics" - - # allows to use the Rest client on ES8+ - es.metrics.compatibility.mode: false - - # ES spout and persistence bolt - es.status.addresses: "http://localhost:9200" - es.status.index.name: "status" - #es.status.user: "USERNAME" - #es.status.password: "PASSWORD" - # the routing is done on the value of 'partition.url.mode' - es.status.routing: true - # stores the value used for grouping the URLs as a separate field - # needed by the spout implementations - # also used for routing if the value above is set to true - es.status.routing.fieldname: "key" - es.status.bulkActions: 500 - es.status.flushInterval: "5s" - es.status.concurrentRequests: 1 - - # allows to use the Rest client on ES8+ - es.status.compatibility.mode: false - - ################ - # spout config # - ################ - - # positive or negative filters parsable by the Lucene Query Parser - # es.status.filterQuery: - # - "-(key:stormcrawler.net)" - # - "-(key:digitalpebble.com)" - - # time in secs for which the URLs will be considered for fetching after a ack or fail - spout.ttl.purgatory: 30 - - # Min time (in msecs) to allow between 2 successive queries to ES - spout.min.delay.queries: 2000 - - # Max time (in msecs) to allow between 2 successive queries to ES - spout.max.delay.queries: 20000 - - # Delay since previous query date (in secs) after which the nextFetchDate value will be reset to the current time - # Setting this to -1 or a large value means that the ES will cache the results but also that less and less results - # might be returned. 
- spout.reset.fetchdate.after: 120 - - es.status.max.buckets: 50 - es.status.max.urls.per.bucket: 2 - # field to group the URLs into buckets - es.status.bucket.field: "key" - # fields to sort the URLs within a bucket - es.status.bucket.sort.field: - - "nextFetchDate" - - "url" - # field to sort the buckets - es.status.global.sort.field: "nextFetchDate" - - # CollapsingSpout : limits the deep paging by resetting the start offset for the ES query - es.status.max.start.offset: 500 - - # AggregationSpout : sampling improves the performance on large crawls - es.status.sample: false - - # max allowed duration of a query in sec - es.status.query.timeout: -1 - - # AggregationSpout (expert): adds this value in mins to the latest date returned in the results and - # use it as nextFetchDate - es.status.recentDate.increase: -1 - es.status.recentDate.min.gap: -1 - - topology.metrics.consumer.register: - - class: "org.apache.stormcrawler.elasticsearch.metrics.MetricsConsumer" - parallelism.hint: 1 - #whitelist: - # - "fetcher_counter" - # - "fetcher_average.bytes_fetched" - #blacklist: - # - "__receive.*" diff --git a/external/elasticsearch/archetype/src/main/resources/archetype-resources/es-crawler.flux b/external/elasticsearch/archetype/src/main/resources/archetype-resources/es-crawler.flux deleted file mode 100644 index 92d596725..000000000 --- a/external/elasticsearch/archetype/src/main/resources/archetype-resources/es-crawler.flux +++ /dev/null @@ -1,141 +0,0 @@ -name: "crawler" - -includes: - - resource: true - file: "/crawler-default.yaml" - override: false - - - resource: false - file: "crawler-conf.yaml" - override: true - - - resource: false - file: "es-conf.yaml" - override: true - -spouts: - - id: "spout" - className: "org.apache.stormcrawler.elasticsearch.persistence.AggregationSpout" - parallelism: 10 - -bolts: - - id: "partitioner" - className: "org.apache.stormcrawler.bolt.URLPartitionerBolt" - parallelism: 1 - - id: "fetcher" - className: "org.apache.stormcrawler.bolt.FetcherBolt" - parallelism: 1 - - id: "sitemap" - className: "org.apache.stormcrawler.bolt.SiteMapParserBolt" - parallelism: 1 - - id: "parse" - className: "org.apache.stormcrawler.bolt.JSoupParserBolt" - parallelism: 1 - - id: "shunt" - className: "org.apache.stormcrawler.tika.RedirectionBolt" - parallelism: 1 - - id: "tika" - className: "org.apache.stormcrawler.tika.ParserBolt" - parallelism: 1 - - id: "index" - className: "org.apache.stormcrawler.elasticsearch.bolt.IndexerBolt" - parallelism: 1 - - id: "status" - className: "org.apache.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt" - parallelism: 1 - - id: "deleter" - className: "org.apache.stormcrawler.elasticsearch.bolt.DeletionBolt" - parallelism: 1 - - id: "status_metrics" - className: "org.apache.stormcrawler.elasticsearch.metrics.StatusMetricsBolt" - parallelism: 1 - -streams: - - from: "spout" - to: "partitioner" - grouping: - type: SHUFFLE - - - from: "__system" - to: "status_metrics" - grouping: - type: SHUFFLE - streamId: "__tick" - - - from: "partitioner" - to: "fetcher" - grouping: - type: FIELDS - args: ["key"] - - - from: "fetcher" - to: "sitemap" - grouping: - type: LOCAL_OR_SHUFFLE - - - from: "sitemap" - to: "parse" - grouping: - type: LOCAL_OR_SHUFFLE - - - from: "parse" - to: "shunt" - grouping: - type: LOCAL_OR_SHUFFLE - - - from: "shunt" - to: "tika" - grouping: - type: LOCAL_OR_SHUFFLE - streamId: "tika" - - - from: "tika" - to: "index" - grouping: - type: LOCAL_OR_SHUFFLE - - - from: "shunt" - to: "index" - grouping: - type: 
LOCAL_OR_SHUFFLE - - - from: "fetcher" - to: "status" - grouping: - type: FIELDS - args: ["url"] - streamId: "status" - - - from: "sitemap" - to: "status" - grouping: - type: FIELDS - args: ["url"] - streamId: "status" - - - from: "parse" - to: "status" - grouping: - type: FIELDS - args: ["url"] - streamId: "status" - - - from: "tika" - to: "status" - grouping: - type: FIELDS - args: ["url"] - streamId: "status" - - - from: "index" - to: "status" - grouping: - type: FIELDS - args: ["url"] - streamId: "status" - - - from: "status" - to: "deleter" - grouping: - type: LOCAL_OR_SHUFFLE - streamId: "deletion" diff --git a/external/elasticsearch/archetype/src/main/resources/archetype-resources/es-injection.flux b/external/elasticsearch/archetype/src/main/resources/archetype-resources/es-injection.flux deleted file mode 100644 index ad0e772b3..000000000 --- a/external/elasticsearch/archetype/src/main/resources/archetype-resources/es-injection.flux +++ /dev/null @@ -1,50 +0,0 @@ -name: "injection" - -includes: - - resource: true - file: "/crawler-default.yaml" - override: false - - - resource: false - file: "crawler-conf.yaml" - override: true - - - resource: false - file: "es-conf.yaml" - override: true - -spouts: - - id: "filespout" - className: "org.apache.stormcrawler.spout.FileSpout" - parallelism: 1 - constructorArgs: - - "." - - "seeds.txt" - - true - -bolts: - - id: "filter" - className: "org.apache.stormcrawler.bolt.URLFilterBolt" - parallelism: 1 - - - id: "status" - className: "org.apache.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt" - parallelism: 1 - -streams: - - from: "filespout" - to: "filter" - grouping: - type: FIELDS - args: ["url"] - streamId: "status" - - - from: "filter" - to: "status" - grouping: - streamId: "status" - type: CUSTOM - customClass: - className: "org.apache.stormcrawler.util.URLStreamGrouping" - constructorArgs: - - "byDomain" diff --git a/external/elasticsearch/archetype/src/main/resources/archetype-resources/kibana/importKibana.sh b/external/elasticsearch/archetype/src/main/resources/archetype-resources/kibana/importKibana.sh deleted file mode 100755 index f0a2edb50..000000000 --- a/external/elasticsearch/archetype/src/main/resources/archetype-resources/kibana/importKibana.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/sh - -BIN=$(dirname $0) - -echo "Importing status dashboard into Kibana" -curl -X POST "localhost:5601/api/saved_objects/_import" -H "kbn-xsrf: true" --form file=@$BIN/status.ndjson -echo "" - -echo "Importing metrics dashboard into Kibana" -curl -X POST "localhost:5601/api/saved_objects/_import" -H "kbn-xsrf: true" --form file=@$BIN/metrics.ndjson -echo "" - -# Storm internal metrics -# curl -X POST "localhost:5601/api/saved_objects/_import" -H "kbn-xsrf: true" --form file=@$BIN/storm.ndjson diff --git a/external/elasticsearch/archetype/src/main/resources/archetype-resources/kibana/metrics.ndjson b/external/elasticsearch/archetype/src/main/resources/archetype-resources/kibana/metrics.ndjson deleted file mode 100644 index 20cbb2bc0..000000000 --- a/external/elasticsearch/archetype/src/main/resources/archetype-resources/kibana/metrics.ndjson +++ /dev/null @@ -1,10 +0,0 @@ 
-{"attributes":{"fields":"[{\"name\":\"_id\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_index\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_score\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_source\",\"type\":\"_source\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_type\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"name\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcComponentId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcTaskId\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerHost\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerPort\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"stormId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"timestamp\",\"type\":\"date\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"value\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true}]","timeFieldName":"timestamp","title":"metrics"},"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","migrationVersion":{"index-pattern":"6.5.0"},"references":[],"type":"index-pattern","updated_at":"2020-01-06T11:06:58.178Z","version":"WzMsMV0="} -{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:activethreads\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : # active threads","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"3\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"2\"}}],\"listeners\":{},\"title\":\"Fetcher : # active threads\"}"},"id":"Fetcher-:-#-active-threads","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.178Z","version":"WzksMV0="} 
-{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:num_queues\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : num queues","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{},\"spyPerPage\":10},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"3\"}}],\"listeners\":{},\"title\":\"Fetcher : num queues\"}"},"id":"Fetcher-:-num-queues","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.175Z","version":"WzgsMV0="} -{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_counter.fetched\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : pages fetched","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{},\"spyPerPage\":10},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"sum\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"3\"}}],\"listeners\":{},\"title\":\"Fetcher : pages fetched\"}"},"id":"Fetcher-:-pages-fetched","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.170Z","version":"WzcsMV0="} -{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:in_queues\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : URLs waiting in 
queues","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"addLegend\":false,\"addTimeMarker\":false,\"addTooltip\":true,\"defaultYExtents\":false,\"mode\":\"grouped\",\"scale\":\"linear\",\"setYExtents\":false,\"shareYAxis\":true,\"spyPerPage\":10,\"times\":[],\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"4\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"5\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"srcTaskId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"4\"}}],\"listeners\":{},\"title\":\"Fetcher : URLs waiting in queues\"}"},"id":"Fetcher-:-URLs-waiting-in-queues","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.160Z","version":"WzUsMV0="} -{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_average_persec.bytes_fetched_perSec\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : average bytes per second","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"line\",\"params\":{\"shareYAxis\":true,\"addTooltip\":false,\"addLegend\":false,\"showCircles\":true,\"smoothLines\":true,\"interpolate\":\"linear\",\"scale\":\"linear\",\"drawLinesBetweenPoints\":true,\"radiusRatio\":9,\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}}],\"listeners\":{},\"title\":\"Fetcher : average bytes per second\"}"},"id":"Fetcher-:-average-bytes-per-second","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.173Z","version":"WzYsMV0="} -{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_average_persec.fetched_perSec\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Fetcher : average pages per second","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"line\",\"params\":{\"shareYAxis\":true,\"addTooltip\":false,\"addLegend\":false,\"showCircles\":true,\"smoothLines\":true,\"interpolate\":\"linear\",\"scale\":\"linear\",\"drawLinesBetweenPoints\":true,\"radiusRatio\":9,\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"2\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"3\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}}],\"listeners\":{},\"title\":\"Fetcher : average pages per 
second\"}"},"id":"Fetcher-:-average-pages-per-second","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.820Z","version":"WzEwLDFd"} -{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name:fetcher_counter.bytes_fetched\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Total bytes fetched","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"line\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":false,\"showCircles\":true,\"smoothLines\":true,\"interpolate\":\"linear\",\"scale\":\"linear\",\"drawLinesBetweenPoints\":true,\"radiusRatio\":9,\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"sum\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"m\",\"min_doc_count\":1,\"extended_bounds\":{}}}],\"listeners\":{},\"title\":\"Total bytes fetched\"}"},"id":"Total-bytes-fetched","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:06:59.823Z","version":"WzExLDFd"} -{"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"filter\":[],\"query\":{\"query\":{\"query_string\":{\"analyze_wildcard\":true,\"query\":\"*\"}},\"language\":\"lucene\"}}"},"optionsJSON":"{\"useMargins\":true,\"hidePanelTitles\":false}","panelsJSON":"[{\"panelIndex\":\"1\",\"gridData\":{\"x\":24,\"y\":20,\"w\":12,\"h\":12,\"i\":\"1\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_0\"},{\"panelIndex\":\"2\",\"gridData\":{\"x\":12,\"y\":20,\"w\":12,\"h\":12,\"i\":\"2\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_1\"},{\"panelIndex\":\"3\",\"gridData\":{\"x\":0,\"y\":0,\"w\":36,\"h\":12,\"i\":\"3\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_2\"},{\"panelIndex\":\"4\",\"gridData\":{\"x\":0,\"y\":20,\"w\":12,\"h\":12,\"i\":\"4\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_3\"},{\"panelIndex\":\"5\",\"gridData\":{\"x\":0,\"y\":40,\"w\":36,\"h\":8,\"i\":\"5\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_4\"},{\"panelIndex\":\"6\",\"gridData\":{\"x\":0,\"y\":32,\"w\":36,\"h\":8,\"i\":\"6\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_5\"},{\"panelIndex\":\"7\",\"gridData\":{\"x\":0,\"y\":12,\"w\":36,\"h\":8,\"i\":\"7\"},\"version\":\"7.3.0\",\"panelRefName\":\"panel_6\"}]","timeRestore":false,"title":"Crawl 
metrics","version":1},"id":"Crawl-metrics","migrationVersion":{"dashboard":"7.3.0"},"references":[{"id":"Fetcher-:-#-active-threads","name":"panel_0","type":"visualization"},{"id":"Fetcher-:-num-queues","name":"panel_1","type":"visualization"},{"id":"Fetcher-:-pages-fetched","name":"panel_2","type":"visualization"},{"id":"Fetcher-:-URLs-waiting-in-queues","name":"panel_3","type":"visualization"},{"id":"Fetcher-:-average-bytes-per-second","name":"panel_4","type":"visualization"},{"id":"Fetcher-:-average-pages-per-second","name":"panel_5","type":"visualization"},{"id":"Total-bytes-fetched","name":"panel_6","type":"visualization"}],"type":"dashboard","updated_at":"2020-01-06T11:06:58.830Z","version":"WzQsMV0="} -{"exportedCount":9,"missingRefCount":0,"missingReferences":[]} diff --git a/external/elasticsearch/archetype/src/main/resources/archetype-resources/kibana/status.ndjson b/external/elasticsearch/archetype/src/main/resources/archetype-resources/kibana/status.ndjson deleted file mode 100644 index b3d0122e4..000000000 --- a/external/elasticsearch/archetype/src/main/resources/archetype-resources/kibana/status.ndjson +++ /dev/null @@ -1,5 +0,0 @@ -{"attributes":{"fields":"[{\"name\":\"_id\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_index\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_score\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_source\",\"type\":\"_source\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_type\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"key\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"metadata._redirTo\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.depth\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.error%2Ecause\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.error%2Esource\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.fetch%2Eerror%2Ecount\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.isFeed\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.isSitemap\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"metadata.url%2Epath\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"nextFetchDate\",\"type\":\"date\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"status\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true
,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"url\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":true,\"readFromDocValues\":true}]","title":"status"},"id":"7445c390-7339-11e9-9289-ffa3ee6775e4","migrationVersion":{"index-pattern":"6.5.0"},"references":[],"type":"index-pattern","updated_at":"2020-01-06T11:07:47.130Z","version":"WzEzLDFd"} -{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"*\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"status count","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"table\",\"params\":{\"perPage\":10,\"showPartialRows\":false,\"showMeticsAtAllLevels\":false},\"aggs\":[{\"id\":\"1\",\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"status\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{},\"title\":\"status count\"}"},"id":"status-count","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"7445c390-7339-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:07:48.278Z","version":"WzE1LDFd"} -{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"*\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Top Hosts","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"table\",\"params\":{\"perPage\":10,\"showPartialRows\":false,\"showMeticsAtAllLevels\":false},\"aggs\":[{\"id\":\"1\",\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"key\",\"size\":50,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{},\"title\":\"Top Hosts\"}"},"id":"Top-Hosts","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"7445c390-7339-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:07:48.281Z","version":"WzE2LDFd"} -{"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"filter\":[],\"query\":{\"query\":\"*\",\"language\":\"lucene\"}}"},"optionsJSON":"{\"useMargins\":true,\"hidePanelTitles\":false}","panelsJSON":"[{\"panelRefName\":\"panel_0\",\"version\":\"7.3.0\",\"panelIndex\":\"37874bbf-6607-435a-a231-94d81e9193e7\",\"gridData\":{\"x\":0,\"y\":0,\"w\":16,\"h\":20,\"i\":\"37874bbf-6607-435a-a231-94d81e9193e7\"},\"embeddableConfig\":{}},{\"panelRefName\":\"panel_1\",\"version\":\"7.3.0\",\"panelIndex\":\"4faa5b74-1660-44f7-9227-89d900c8231e\",\"gridData\":{\"x\":16,\"y\":0,\"w\":16,\"h\":20,\"i\":\"4faa5b74-1660-44f7-9227-89d900c8231e\"},\"embeddableConfig\":{}}]","timeRestore":false,"title":"Crawl status","version":1},"id":"Crawl-status","migrationVersion":{"dashboard":"7.3.0"},"references":[{"id":"status-count","name":"panel_0","type":"visualization"},{"id":"Top-Hosts","name":"panel_1","type":"visualization"}],"type":"dashboard","updated_at":"2020-01-06T11:07:47.948Z","version":"WzE0LDFd"} -{"exportedCount":4,"missingRefCount":0,"missingReferences":[]} diff --git 
a/external/elasticsearch/archetype/src/main/resources/archetype-resources/kibana/storm.ndjson b/external/elasticsearch/archetype/src/main/resources/archetype-resources/kibana/storm.ndjson deleted file mode 100644 index 880c2326f..000000000 --- a/external/elasticsearch/archetype/src/main/resources/archetype-resources/kibana/storm.ndjson +++ /dev/null @@ -1,5 +0,0 @@ -{"attributes":{"fields":"[{\"name\":\"_id\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_index\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_score\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_source\",\"type\":\"_source\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_type\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"name\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcComponentId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcTaskId\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerHost\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"srcWorkerPort\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"stormId\",\"type\":\"string\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"timestamp\",\"type\":\"date\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"value\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true}]","timeFieldName":"timestamp","title":"metrics"},"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","migrationVersion":{"index-pattern":"6.5.0"},"references":[],"type":"index-pattern","updated_at":"2020-01-06T11:06:58.178Z","version":"WzMsMV0="} -{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"name: \\\"__receive.population\\\"\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Storm Receive 
Queues","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"table\",\"params\":{\"perPage\":10,\"showPartialRows\":false,\"showMeticsAtAllLevels\":false},\"aggs\":[{\"id\":\"1\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"srcTaskId\",\"size\":10,\"order\":\"desc\",\"orderBy\":\"1\"}},{\"id\":\"3\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"srcComponentId\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{}}"},"id":"Storm-Receive-Queues","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:09:12.875Z","version":"WzIwLDFd"} -{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":{\"query_string\":{\"query\":\"+srcComponentId: \\\"__system\\\" +name: memory\\\\/heap*\",\"analyze_wildcard\":true}},\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Memory Heap","uiStateJSON":"{}","version":1,"visState":"{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":true,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":true,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"avg\",\"schema\":\"metric\",\"params\":{\"field\":\"value\"}},{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"timestamp\",\"interval\":\"auto\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"name\",\"size\":10,\"order\":\"desc\",\"orderBy\":\"1\"}},{\"id\":\"4\",\"type\":\"terms\",\"schema\":\"split\",\"params\":{\"field\":\"srcWorkerHost\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\",\"row\":true}}],\"listeners\":{}}"},"id":"Memory-Heap","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"b5c3bbd0-7337-11e9-9289-ffa3ee6775e4","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-01-06T11:09:12.877Z","version":"WzIxLDFd"} -{"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"filter\":[],\"query\":{\"query\":\"*\",\"language\":\"lucene\"}}"},"optionsJSON":"{\"useMargins\":true,\"hidePanelTitles\":false}","panelsJSON":"[{\"panelRefName\":\"panel_0\",\"version\":\"7.3.0\",\"panelIndex\":\"19123ee9-8f49-4621-a4dc-716b5ff9fcaf\",\"gridData\":{\"x\":0,\"y\":0,\"w\":32,\"h\":8,\"i\":\"19123ee9-8f49-4621-a4dc-716b5ff9fcaf\"},\"embeddableConfig\":{}},{\"panelRefName\":\"panel_1\",\"version\":\"7.3.0\",\"panelIndex\":\"5fd83542-b7e6-48e0-8679-2ffcacf453a3\",\"gridData\":{\"x\":0,\"y\":8,\"w\":32,\"h\":16,\"i\":\"5fd83542-b7e6-48e0-8679-2ffcacf453a3\"},\"embeddableConfig\":{}}]","timeRestore":false,"title":"Storm metrics","version":1},"id":"Storm-metrics","migrationVersion":{"dashboard":"7.3.0"},"references":[{"id":"Storm-Receive-Queues","name":"panel_0","type":"visualization"},{"id":"Memory-Heap","name":"panel_1","type":"visualization"}],"type":"dashboard","updated_at":"2020-01-06T11:08:33.810Z","version":"WzE5LDFd"} -{"exportedCount":4,"missingRefCount":0,"missingReferences":[]} \ No newline at end of file diff --git 
a/external/elasticsearch/archetype/src/main/resources/archetype-resources/pom.xml b/external/elasticsearch/archetype/src/main/resources/archetype-resources/pom.xml deleted file mode 100644 index cca05f97b..000000000 --- a/external/elasticsearch/archetype/src/main/resources/archetype-resources/pom.xml +++ /dev/null @@ -1,149 +0,0 @@ - - - - - - - 4.0.0 - ${groupId} - ${artifactId} - ${version} - jar - - ${artifactId} - - - UTF-8 - ${StormCrawlerVersion} - 2.6.1 - - - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.11.0 - - 11 - 11 - - - - org.codehaus.mojo - exec-maven-plugin - 3.1.0 - - - - exec - - - - - java - true - false - compile - - - - org.apache.maven.plugins - maven-shade-plugin - 3.5.0 - - - package - - shade - - - false - - - - org.apache.storm.flux.Flux - - - - - - - - - - *:* - - META-INF/*.SF - META-INF/*.DSA - META-INF/*.RSA - - - - - org.apache.storm:flux-core - - org/apache/commons/** - org/apache/http/** - org/yaml/** - - - - - - - - - - - - - org.apache.stormcrawler - storm-crawler-core - ${stormcrawler.version} - - - org.apache.stormcrawler - storm-crawler-elasticsearch - ${stormcrawler.version} - - - org.apache.storm - storm-client - ${storm.version} - provided - - - org.apache.storm - flux-core - ${storm.version} - - - org.apache.stormcrawler - storm-crawler-tika - ${stormcrawler.version} - - - diff --git a/external/elasticsearch/archetype/src/main/resources/archetype-resources/src/main/java/ESCrawlTopology.java b/external/elasticsearch/archetype/src/main/resources/archetype-resources/src/main/java/ESCrawlTopology.java deleted file mode 100644 index c9eeaf713..000000000 --- a/external/elasticsearch/archetype/src/main/resources/archetype-resources/src/main/java/ESCrawlTopology.java +++ /dev/null @@ -1,110 +0,0 @@ -#set($symbol_pound='#')#set($symbol_dollar='$')#set($symbol_escape='\') - -/** - * Licensed to DigitalPebble Ltd under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * DigitalPebble licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package ${package}; - -import org.apache.storm.metric.LoggingMetricsConsumer; -import org.apache.storm.topology.TopologyBuilder; -import org.apache.storm.tuple.Fields; - -import org.apache.stormcrawler.ConfigurableTopology; -import org.apache.stormcrawler.Constants; -import org.apache.stormcrawler.bolt.FetcherBolt; -import org.apache.stormcrawler.bolt.JSoupParserBolt; -import org.apache.stormcrawler.bolt.SiteMapParserBolt; -import org.apache.stormcrawler.bolt.URLFilterBolt; -import org.apache.stormcrawler.bolt.URLPartitionerBolt; -import org.apache.stormcrawler.elasticsearch.bolt.DeletionBolt; -import org.apache.stormcrawler.elasticsearch.bolt.IndexerBolt; -import org.apache.stormcrawler.elasticsearch.metrics.MetricsConsumer; -import org.apache.stormcrawler.elasticsearch.metrics.StatusMetricsBolt; -import org.apache.stormcrawler.elasticsearch.persistence.AggregationSpout; -import org.apache.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt; -import org.apache.stormcrawler.spout.FileSpout; -import org.apache.stormcrawler.util.ConfUtils; -import org.apache.stormcrawler.util.URLStreamGrouping; -import org.apache.stormcrawler.tika.ParserBolt; -import org.apache.stormcrawler.tika.RedirectionBolt; - -/** - * Dummy topology to play with the spouts and bolts on ElasticSearch - */ -public class ESCrawlTopology extends ConfigurableTopology { - - public static void main(String[] args) throws Exception { - ConfigurableTopology.start(new ESCrawlTopology(), args); - } - - @Override - protected int run(String[] args) { - TopologyBuilder builder = new TopologyBuilder(); - - int numWorkers = ConfUtils.getInt(getConf(), "topology.workers", 1); - - if (args.length == 0) { - System.err.println("ESCrawlTopology seed_dir file_filter"); - return -1; - } - - // set to the real number of shards ONLY if es.status.routing is set to - // true in the configuration - int numShards = 1; - - builder.setSpout("filespout", new FileSpout(args[0], args[1], true)); - - Fields key = new Fields("url"); - - builder.setBolt("filter", new URLFilterBolt()).fieldsGrouping("filespout", Constants.StatusStreamName, key); - - builder.setSpout("spout", new AggregationSpout(), numShards); - - builder.setBolt("status_metrics", new StatusMetricsBolt()).shuffleGrouping("spout"); - - builder.setBolt("partitioner", new URLPartitionerBolt(), numWorkers).shuffleGrouping("spout"); - - builder.setBolt("fetch", new FetcherBolt(), numWorkers).fieldsGrouping("partitioner", new Fields("key")); - - builder.setBolt("sitemap", new SiteMapParserBolt(), numWorkers).localOrShuffleGrouping("fetch"); - - builder.setBolt("parse", new JSoupParserBolt(), numWorkers).localOrShuffleGrouping("sitemap"); - - builder.setBolt("shunt", new RedirectionBolt()).localOrShuffleGrouping("parse"); - - builder.setBolt("tika", new ParserBolt()).localOrShuffleGrouping("shunt", "tika"); - - builder.setBolt("indexer", new IndexerBolt(), numWorkers).localOrShuffleGrouping("shunt") - .localOrShuffleGrouping("tika"); - - builder.setBolt("status", new StatusUpdaterBolt(), numWorkers) - .fieldsGrouping("fetch", Constants.StatusStreamName, key) - .fieldsGrouping("sitemap", Constants.StatusStreamName, key) - .fieldsGrouping("parse", Constants.StatusStreamName, key) - .fieldsGrouping("tika", Constants.StatusStreamName, key) - .fieldsGrouping("indexer", Constants.StatusStreamName, key) - .customGrouping("filter", Constants.StatusStreamName, new URLStreamGrouping()); - - builder.setBolt("deleter", new DeletionBolt(), numWorkers).localOrShuffleGrouping("status", - 
Constants.DELETION_STREAM_NAME); - - conf.registerMetricsConsumer(MetricsConsumer.class); - conf.registerMetricsConsumer(LoggingMetricsConsumer.class); - - return submit("crawl", conf, builder); - } -} diff --git a/external/elasticsearch/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-filters.txt b/external/elasticsearch/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-filters.txt deleted file mode 100644 index 389ef587b..000000000 --- a/external/elasticsearch/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-filters.txt +++ /dev/null @@ -1,32 +0,0 @@ -# skip file: ftp: and mailto: urls --^(file|ftp|mailto): - -# skip image and other suffixes we can't parse or are not likely to be relevant -# if you want to crawl images or videos or archives then you should comment out this line --(?i)\.(apk|deb|cab|iso|gif|jpg|png|svg|ico|css|sit|eps|wmf|rar|tar|jar|zip|gz|bz2|rpm|tgz|mov|exe|jpeg|jpe|bmp|js|mpg|mp3|mp4|m4a|ogv|kml|wmv|swf|flv|mkv|m4v|webm|ra|wma|wav|avi|xspf|m3u)(\?|&|$) - -# skip URLs with slash-delimited segment that repeats 3+ times, to break loops -# very time-consuming : use BasicURLFilter instead -# -.*(/[^/]+)/[^/]+\1/[^/]+\1/ - -# exclude localhost and equivalents to avoid that information -# can be leaked by placing faked links pointing to web interfaces -# of services running on the crawling machine (e.g., Elasticsearch, -# Storm) -# -# - exclude localhost and loop-back addresses -# http://localhost:8080 -# http://127.0.0.1/ .. http://127.255.255.255/ -# http://[::1]/ --^https?://(?:localhost|127(?:\.(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))){3}|\[::1\])(?::\d+)?(?:/|$) -# -# - exclude private IP address spaces -# 10.0.0.0/8 --^https?://(?:10(?:\.(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))){3})(?::\d+)?(?:/|$) -# 192.168.0.0/16 --^https?://(?:192\.168(?:\.(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))){2})(?::\d+)?(?:/|$) -# 172.16.0.0/12 --^https?://(?:172\.(?:1[6789]|2[0-9]|3[01])(?:\.(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))){2})(?::\d+)?(?:/|$) - -# accept anything else -+. 
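The regex rules removed above are evaluated in order: the first `-` rule that matches drops the URL, and anything that reaches the final `+.` rule is kept. A small, self-contained illustration of the localhost/loop-back exclusion pattern (the class name and sample URLs are made up for the example):

```java
import java.util.regex.Pattern;

public class RegexFilterSketch {
    // Same pattern as the "-" rule above that excludes localhost and loop-back addresses,
    // with backslashes doubled for a Java string literal.
    private static final Pattern LOCALHOST = Pattern.compile(
            "^https?://(?:localhost|127(?:\\.(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))){3}|\\[::1\\])(?::\\d+)?(?:/|$)");

    public static void main(String[] args) {
        // A match against a "-" rule means the URL is filtered out of the crawl.
        System.out.println(LOCALHOST.matcher("http://localhost:9200/status").find());   // true  -> dropped
        System.out.println(LOCALHOST.matcher("https://example.com/index.html").find()); // false -> kept
    }
}
```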
diff --git a/external/elasticsearch/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-normalizers.xml b/external/elasticsearch/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-normalizers.xml deleted file mode 100644 index 101bfd6b5..000000000 --- a/external/elasticsearch/archetype/src/main/resources/archetype-resources/src/main/resources/default-regex-normalizers.xml +++ /dev/null @@ -1,78 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/external/elasticsearch/archetype/src/main/resources/archetype-resources/src/main/resources/jsoupfilters.json b/external/elasticsearch/archetype/src/main/resources/archetype-resources/src/main/resources/jsoupfilters.json deleted file mode 100644 index 4d87d8d5a..000000000 --- a/external/elasticsearch/archetype/src/main/resources/archetype-resources/src/main/resources/jsoupfilters.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "org.apache.stormcrawler.parse.JSoupFilters": [ - { - "class": "org.apache.stormcrawler.jsoup.XPathFilter", - "name": "XPathFilter", - "params": { - "canonical": "//*[@rel=\"canonical\"]/@href", - "parse.description": [ - "//*[@name=\"description\"]/@content", - "//*[@name=\"Description\"]/@content" - ], - "parse.title": [ - "//TITLE/allText()", - "//META[@name=\"title\"]/@content" - ], - "parse.keywords": "//META[@name=\"keywords\"]/@content" - } - }, - { - "class": "org.apache.stormcrawler.jsoup.LinkParseFilter", - "name": "LinkParseFilter", - "params": { - "pattern": "//FRAME/@src" - } - } - ] -} diff --git a/external/elasticsearch/archetype/src/main/resources/archetype-resources/src/main/resources/parsefilters.json b/external/elasticsearch/archetype/src/main/resources/archetype-resources/src/main/resources/parsefilters.json deleted file mode 100644 index 5d525830d..000000000 --- a/external/elasticsearch/archetype/src/main/resources/archetype-resources/src/main/resources/parsefilters.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "org.apache.stormcrawler.parse.ParseFilters": [ - { - "class": "org.apache.stormcrawler.parse.filter.DomainParseFilter", - "name": "DomainParseFilter", - "params": { - "key": "domain", - "byHost": false - } - }, - { - "class": "org.apache.stormcrawler.parse.filter.MimeTypeNormalization", - "name": "MimeTypeNormalization" - }, - { - "class": "org.apache.stormcrawler.parse.filter.CommaSeparatedToMultivaluedMetadata", - "name": "CommaSeparatedToMultivaluedMetadata", - "params": { - "keys": ["parse.keywords"] - } - } - ] -} diff --git a/external/elasticsearch/archetype/src/main/resources/archetype-resources/src/main/resources/urlfilters.json b/external/elasticsearch/archetype/src/main/resources/archetype-resources/src/main/resources/urlfilters.json deleted file mode 100644 index 6098631bb..000000000 --- a/external/elasticsearch/archetype/src/main/resources/archetype-resources/src/main/resources/urlfilters.json +++ /dev/null @@ -1,60 +0,0 @@ -{ - "org.apache.stormcrawler.filtering.URLFilters": [ - { - "class": "org.apache.stormcrawler.filtering.basic.BasicURLFilter", - "name": "BasicURLFilter", - "params": { - "maxPathRepetition": 3, - "maxLength": 1024 - } - }, - { - "class": "org.apache.stormcrawler.filtering.depth.MaxDepthFilter", - "name": "MaxDepthFilter", - "params": { - "maxDepth": -1 - } - }, - { - "class": "org.apache.stormcrawler.filtering.basic.BasicURLNormalizer", - "name": "BasicURLNormalizer", - "params": { - "removeAnchorPart": true, - "unmangleQueryString": true, - 
"checkValidURI": true, - "removeHashes": true, - "hostIDNtoASCII": true - } - }, - { - "class": "org.apache.stormcrawler.filtering.host.HostURLFilter", - "name": "HostURLFilter", - "params": { - "ignoreOutsideHost": false, - "ignoreOutsideDomain": true - } - }, - { - "class": "org.apache.stormcrawler.filtering.regex.RegexURLNormalizer", - "name": "RegexURLNormalizer", - "params": { - "regexNormalizerFile": "default-regex-normalizers.xml" - } - }, - { - "class": "org.apache.stormcrawler.filtering.regex.RegexURLFilter", - "name": "RegexURLFilter", - "params": { - "regexFilterFile": "default-regex-filters.txt" - } - }, - { - "class": "org.apache.stormcrawler.filtering.basic.SelfURLFilter", - "name": "SelfURLFilter" - }, - { - "class": "org.apache.stormcrawler.filtering.sitemap.SitemapFilter", - "name": "SitemapFilter" - } - ] -} diff --git a/external/elasticsearch/kibana b/external/elasticsearch/kibana deleted file mode 120000 index 26554b3bc..000000000 --- a/external/elasticsearch/kibana +++ /dev/null @@ -1 +0,0 @@ -archetype/src/main/resources/archetype-resources/kibana \ No newline at end of file diff --git a/external/elasticsearch/pom.xml b/external/elasticsearch/pom.xml deleted file mode 100644 index dc5f06158..000000000 --- a/external/elasticsearch/pom.xml +++ /dev/null @@ -1,93 +0,0 @@ - - - - - - 4.0.0 - - - org.apache.stormcrawler - storm-crawler-external - 2.12-SNAPSHOT - ../pom.xml - - - - 7.17.7 - - - storm-crawler-elasticsearch - jar - - storm-crawler-elasticsearch - https://github.com/apache/incubator-stormcrawler/tree/master/external/elasticsearch - Elasticsearch resources for StormCrawler - - - - - maven-surefire-plugin - - - default-test - test - - test - - - - - - ${elasticsearch.version} - - - - - - - - - org.elasticsearch.client - elasticsearch-rest-high-level-client - ${elasticsearch.version} - - - org.elasticsearch.client - elasticsearch-rest-client-sniffer - ${elasticsearch.version} - - - - - org.testcontainers - elasticsearch - test - - - - org.apache.stormcrawler - storm-crawler-core - ${project.version} - test-jar - test - - - - diff --git a/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/BulkItemResponseToFailedFlag.java b/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/BulkItemResponseToFailedFlag.java deleted file mode 100644 index d9492215e..000000000 --- a/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/BulkItemResponseToFailedFlag.java +++ /dev/null @@ -1,113 +0,0 @@ -package org.apache.stormcrawler.elasticsearch; - -import java.io.IOException; -import org.elasticsearch.action.DocWriteRequest; -import org.elasticsearch.action.DocWriteResponse; -import org.elasticsearch.action.bulk.BulkItemResponse; -import org.elasticsearch.common.io.stream.StreamOutput; -import org.elasticsearch.rest.RestStatus; -import org.elasticsearch.xcontent.ToXContent; -import org.elasticsearch.xcontent.XContentBuilder; -import org.jetbrains.annotations.NotNull; - -public final class BulkItemResponseToFailedFlag { - @NotNull public final BulkItemResponse response; - public final boolean failed; - @NotNull public final String id; - - public BulkItemResponseToFailedFlag(@NotNull BulkItemResponse response, boolean failed) { - this.response = response; - this.failed = failed; - this.id = response.getId(); - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (!(o instanceof BulkItemResponseToFailedFlag)) return false; - - BulkItemResponseToFailedFlag that = 
(BulkItemResponseToFailedFlag) o; - - if (failed != that.failed) return false; - if (!response.equals(that.response)) return false; - return id.equals(that.id); - } - - @Override - public int hashCode() { - int result = response.hashCode(); - result = 31 * result + (failed ? 1 : 0); - result = 31 * result + id.hashCode(); - return result; - } - - @Override - public String toString() { - return "BulkItemResponseToFailedFlag{" - + "response=" - + response - + ", failed=" - + failed - + ", id='" - + id - + '\'' - + '}'; - } - - public RestStatus status() { - return response.status(); - } - - public XContentBuilder toXContent(XContentBuilder builder, ToXContent.Params params) - throws IOException { - return response.toXContent(builder, params); - } - - public int getItemId() { - return response.getItemId(); - } - - public DocWriteRequest.OpType getOpType() { - return response.getOpType(); - } - - public String getIndex() { - return response.getIndex(); - } - - public String getType() { - return response.getType(); - } - - public long getVersion() { - return response.getVersion(); - } - - public T getResponse() { - return response.getResponse(); - } - - public boolean isFailed() { - return response.isFailed(); - } - - public String getFailureMessage() { - return response.getFailureMessage(); - } - - public BulkItemResponse.Failure getFailure() { - return response.getFailure(); - } - - public void writeTo(StreamOutput out) throws IOException { - response.writeTo(out); - } - - public void writeThin(StreamOutput out) throws IOException { - response.writeThin(out); - } - - public boolean isFragment() { - return response.isFragment(); - } -} diff --git a/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/ElasticSearchConnection.java b/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/ElasticSearchConnection.java deleted file mode 100644 index 05e435f86..000000000 --- a/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/ElasticSearchConnection.java +++ /dev/null @@ -1,297 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to you under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.stormcrawler.elasticsearch; - -import static org.elasticsearch.client.RestClientBuilder.DEFAULT_CONNECT_TIMEOUT_MILLIS; -import static org.elasticsearch.client.RestClientBuilder.DEFAULT_SOCKET_TIMEOUT_MILLIS; - -import java.io.IOException; -import java.net.URI; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.concurrent.TimeUnit; -import org.apache.http.HttpHost; -import org.apache.http.auth.AuthScope; -import org.apache.http.auth.UsernamePasswordCredentials; -import org.apache.http.client.CredentialsProvider; -import org.apache.http.impl.client.BasicCredentialsProvider; -import org.apache.storm.shade.org.apache.commons.lang.StringUtils; -import org.apache.stormcrawler.util.ConfUtils; -import org.elasticsearch.action.bulk.BulkProcessor; -import org.elasticsearch.action.bulk.BulkRequest; -import org.elasticsearch.action.bulk.BulkResponse; -import org.elasticsearch.action.index.IndexRequest; -import org.elasticsearch.client.Node; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestClient; -import org.elasticsearch.client.RestClientBuilder; -import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.client.RestHighLevelClientBuilder; -import org.elasticsearch.client.sniff.Sniffer; -import org.elasticsearch.core.TimeValue; -import org.jetbrains.annotations.NotNull; -import org.jetbrains.annotations.Nullable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** Utility class to instantiate an ES client and bulkprocessor based on the configuration. */ -public final class ElasticSearchConnection { - - private static final Logger LOG = LoggerFactory.getLogger(ElasticSearchConnection.class); - - @NotNull private final RestHighLevelClient client; - - @NotNull private final BulkProcessor processor; - - @Nullable private final Sniffer sniffer; - - private ElasticSearchConnection(@NotNull RestHighLevelClient c, @NotNull BulkProcessor p) { - this(c, p, null); - } - - private ElasticSearchConnection( - @NotNull RestHighLevelClient c, @NotNull BulkProcessor p, @Nullable Sniffer s) { - processor = p; - client = c; - sniffer = s; - } - - public RestHighLevelClient getClient() { - return client; - } - - public void addToProcessor(final IndexRequest request) { - processor.add(request); - } - - public static RestHighLevelClient getClient(Map stormConf, String boltType) { - - List confighosts = - ConfUtils.loadListFromConf("es." + boltType + ".addresses", stormConf); - - List hosts = new ArrayList<>(); - - for (String host : confighosts) { - // no port specified? use default one - int port = 9200; - String scheme = "http"; - // no scheme specified? use http - if (!host.startsWith(scheme)) { - host = "http://" + host; - } - URI uri = URI.create(host); - if (uri.getHost() == null) { - throw new RuntimeException("host undefined " + host); - } - if (uri.getPort() != -1) { - port = uri.getPort(); - } - if (uri.getScheme() != null) { - scheme = uri.getScheme(); - } - hosts.add(new HttpHost(uri.getHost(), port, scheme)); - } - - RestClientBuilder builder = RestClient.builder(hosts.toArray(new HttpHost[0])); - - // authentication via user / password - String user = ConfUtils.getString(stormConf, "es." + boltType + ".user"); - String password = ConfUtils.getString(stormConf, "es." + boltType + ".password"); - - String proxyhost = ConfUtils.getString(stormConf, "es." + boltType + ".proxy.host"); - - int proxyport = ConfUtils.getInt(stormConf, "es." 
+ boltType + ".proxy.port", -1); - - String proxyscheme = - ConfUtils.getString(stormConf, "es." + boltType + ".proxy.scheme", "http"); - - boolean needsUser = StringUtils.isNotBlank(user) && StringUtils.isNotBlank(password); - boolean needsProxy = StringUtils.isNotBlank(proxyhost) && proxyport != -1; - - if (needsUser || needsProxy) { - builder.setHttpClientConfigCallback( - httpClientBuilder -> { - if (needsUser) { - final CredentialsProvider credentialsProvider = - new BasicCredentialsProvider(); - credentialsProvider.setCredentials( - AuthScope.ANY, new UsernamePasswordCredentials(user, password)); - httpClientBuilder.setDefaultCredentialsProvider(credentialsProvider); - } - if (needsProxy) { - httpClientBuilder.setProxy( - new HttpHost(proxyhost, proxyport, proxyscheme)); - } - return httpClientBuilder; - }); - } - - int connectTimeout = - ConfUtils.getInt( - stormConf, - "es." + boltType + ".connect.timeout", - DEFAULT_CONNECT_TIMEOUT_MILLIS); - int socketTimeout = - ConfUtils.getInt( - stormConf, - "es." + boltType + ".socket.timeout", - DEFAULT_SOCKET_TIMEOUT_MILLIS); - // timeout until connection is established - builder.setRequestConfigCallback( - requestConfigBuilder -> - requestConfigBuilder - .setConnectTimeout(connectTimeout) - .setSocketTimeout(socketTimeout) // Timeout when waiting - // for data - ); - - // TODO check if this has gone somewhere else in ES 7 - // int maxRetryTimeout = ConfUtils.getInt(stormConf, "es." + boltType + - // ".max.retry.timeout", - // DEFAULT_MAX_RETRY_TIMEOUT_MILLIS); - // builder.setMaxRetryTimeoutMillis(maxRetryTimeout); - - // TODO configure headers etc... - // Map configSettings = (Map) stormConf - // .get("es." + boltType + ".settings"); - // if (configSettings != null) { - // configSettings.forEach((k, v) -> settings.put(k, v)); - // } - - // use node selector only to log nodes listed in the config - // and/or discovered through sniffing - builder.setNodeSelector( - nodes -> { - for (Node node : nodes) { - LOG.debug( - "Connected to ES node {} [{}] for {}", - node.getName(), - node.getHost(), - boltType); - } - }); - - final boolean compression = - ConfUtils.getBoolean(stormConf, "es." + boltType + ".compression", false); - - builder.setCompressionEnabled(compression); - - final boolean compatibilityMode = - ConfUtils.getBoolean(stormConf, "es." + boltType + ".compatibility.mode", false); - - return new RestHighLevelClientBuilder(builder.build()) - .setApiCompatibilityMode(compatibilityMode) - .build(); - } - - /** - * Creates a connection with a default listener. The values for bolt type are - * [indexer,status,metrics] - */ - public static ElasticSearchConnection getConnection( - Map stormConf, String boltType) { - BulkProcessor.Listener listener = - new BulkProcessor.Listener() { - @Override - public void afterBulk(long arg0, BulkRequest arg1, BulkResponse arg2) {} - - @Override - public void afterBulk(long arg0, BulkRequest arg1, Throwable arg2) {} - - @Override - public void beforeBulk(long arg0, BulkRequest arg1) {} - }; - return getConnection(stormConf, boltType, listener); - } - - public static ElasticSearchConnection getConnection( - Map stormConf, String boltType, BulkProcessor.Listener listener) { - - String flushIntervalString = - ConfUtils.getString(stormConf, "es." + boltType + ".flushInterval", "5s"); - - TimeValue flushInterval = - TimeValue.parseTimeValue( - flushIntervalString, TimeValue.timeValueSeconds(5), "flushInterval"); - - int bulkActions = ConfUtils.getInt(stormConf, "es." 
+ boltType + ".bulkActions", 50); - - int concurrentRequests = - ConfUtils.getInt(stormConf, "es." + boltType + ".concurrentRequests", 1); - - RestHighLevelClient client = getClient(stormConf, boltType); - - boolean sniff = ConfUtils.getBoolean(stormConf, "es." + boltType + ".sniff", true); - Sniffer sniffer = null; - if (sniff) { - sniffer = Sniffer.builder(client.getLowLevelClient()).build(); - } - - BulkProcessor bulkProcessor = - BulkProcessor.builder( - (request, bulkListener) -> - client.bulkAsync( - request, RequestOptions.DEFAULT, bulkListener), - listener, - boltType + "-bulk-processor") - .setFlushInterval(flushInterval) - .setBulkActions(bulkActions) - .setConcurrentRequests(concurrentRequests) - .build(); - - return new ElasticSearchConnection(client, bulkProcessor, sniffer); - } - - private boolean isClosed = false; - - public void close() { - - if (isClosed) { - LOG.warn("Tried to close an already closed connection!"); - return; - } - - // Maybe some kind of identifier? - LOG.debug("Start closing the ElasticSearchConnection"); - - // First, close the BulkProcessor ensuring pending actions are flushed - try { - boolean success = processor.awaitClose(60, TimeUnit.SECONDS); - if (!success) { - throw new RuntimeException( - "Failed to flush pending actions when closing BulkProcessor"); - } - } catch (InterruptedException e) { - throw new RuntimeException(e); - } - - if (sniffer != null) { - sniffer.close(); - } - - // Now close the actual client - try { - client.close(); - } catch (IOException e) { - // ignore silently - LOG.trace("Client threw IO exception."); - } - - isClosed = true; - } -} diff --git a/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/bolt/DeletionBolt.java b/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/bolt/DeletionBolt.java deleted file mode 100644 index cbdfa9f94..000000000 --- a/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/bolt/DeletionBolt.java +++ /dev/null @@ -1,106 +0,0 @@ -package org.apache.stormcrawler.elasticsearch.bolt; - -import java.io.IOException; -import java.lang.invoke.MethodHandles; -import java.util.Map; -import org.apache.storm.task.OutputCollector; -import org.apache.storm.task.TopologyContext; -import org.apache.storm.topology.OutputFieldsDeclarer; -import org.apache.storm.topology.base.BaseRichBolt; -import org.apache.storm.tuple.Tuple; -import org.apache.stormcrawler.Metadata; -import org.apache.stormcrawler.elasticsearch.ElasticSearchConnection; -import org.apache.stormcrawler.util.ConfUtils; -import org.elasticsearch.action.delete.DeleteRequest; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestHighLevelClient; -import org.slf4j.LoggerFactory; - -/** - * Deletes documents to ElasticSearch. This should be connected to the StatusUpdaterBolt via the - * 'deletion' stream and will remove the documents with a status of ERROR one by one. Note that this - * component will also try to delete documents even though they were never indexed and it currently - * won't delete documents which were indexed under the canonical URL. 
- */ -public class DeletionBolt extends BaseRichBolt { - - static final org.slf4j.Logger LOG = - LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - - private static final String ESBoltType = "indexer"; - - private OutputCollector _collector; - - private String indexName; - - private RestHighLevelClient client; - - public DeletionBolt() {} - - /** Sets the index name instead of taking it from the configuration. * */ - public DeletionBolt(String indexName) { - this.indexName = indexName; - } - - @Override - public void prepare( - Map conf, TopologyContext context, OutputCollector collector) { - _collector = collector; - if (indexName == null) { - indexName = ConfUtils.getString(conf, IndexerBolt.ESIndexNameParamName, "content"); - } - client = ElasticSearchConnection.getClient(conf, ESBoltType); - } - - @Override - public void cleanup() { - if (client != null) - try { - client.close(); - } catch (IOException e) { - } - } - - @Override - public void execute(Tuple tuple) { - String url = tuple.getStringByField("url"); - Metadata metadata = (Metadata) tuple.getValueByField("metadata"); - - // keep it simple for now and ignore cases where the canonical URL was - // used - String docID = getDocumentID(metadata, url); - DeleteRequest dr = new DeleteRequest(getIndexName(metadata), docID); - try { - client.delete(dr, RequestOptions.DEFAULT); - } catch (IOException e) { - _collector.fail(tuple); - LOG.error("Exception caught while deleting", e); - return; - } - _collector.ack(tuple); - } - - /** - * Get the document id. - * - * @param metadata The {@link Metadata}. - * @param url The normalised url. - * @return Return the normalised url SHA-256 digest as String. - */ - protected String getDocumentID(Metadata metadata, String url) { - return org.apache.commons.codec.digest.DigestUtils.sha256Hex(url); - } - - @Override - public void declareOutputFields(OutputFieldsDeclarer arg0) { - // none - } - - /** - * Must be overridden for implementing custom index names based on some metadata information By - * Default, indexName coming from config is used - */ - protected String getIndexName(Metadata m) { - return indexName; - } -} diff --git a/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/bolt/IndexerBolt.java b/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/bolt/IndexerBolt.java deleted file mode 100644 index e0a86fa82..000000000 --- a/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/bolt/IndexerBolt.java +++ /dev/null @@ -1,454 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to you under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.stormcrawler.elasticsearch.bolt; - -import static org.apache.stormcrawler.Constants.StatusStreamName; -import static org.elasticsearch.xcontent.XContentFactory.jsonBuilder; - -import com.github.benmanes.caffeine.cache.Cache; -import com.github.benmanes.caffeine.cache.Caffeine; -import com.github.benmanes.caffeine.cache.RemovalCause; -import com.github.benmanes.caffeine.cache.RemovalListener; -import java.io.IOException; -import java.util.*; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.locks.ReentrantLock; -import java.util.stream.Collectors; -import org.apache.commons.lang.StringUtils; -import org.apache.storm.metric.api.MultiCountMetric; -import org.apache.storm.metric.api.MultiReducedMetric; -import org.apache.storm.task.OutputCollector; -import org.apache.storm.task.TopologyContext; -import org.apache.storm.tuple.Tuple; -import org.apache.storm.tuple.Values; -import org.apache.stormcrawler.Constants; -import org.apache.stormcrawler.Metadata; -import org.apache.stormcrawler.elasticsearch.BulkItemResponseToFailedFlag; -import org.apache.stormcrawler.elasticsearch.ElasticSearchConnection; -import org.apache.stormcrawler.indexing.AbstractIndexerBolt; -import org.apache.stormcrawler.persistence.Status; -import org.apache.stormcrawler.util.ConfUtils; -import org.apache.stormcrawler.util.PerSecondReducer; -import org.elasticsearch.action.DocWriteRequest; -import org.elasticsearch.action.bulk.BulkItemResponse; -import org.elasticsearch.action.bulk.BulkProcessor; -import org.elasticsearch.action.bulk.BulkRequest; -import org.elasticsearch.action.bulk.BulkResponse; -import org.elasticsearch.action.index.IndexRequest; -import org.elasticsearch.rest.RestStatus; -import org.elasticsearch.xcontent.XContentBuilder; -import org.jetbrains.annotations.NotNull; -import org.jetbrains.annotations.Nullable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Sends documents to ElasticSearch. Indexes all the fields from the tuples or a Map - * <String,Object> from a named field. - */ -public class IndexerBolt extends AbstractIndexerBolt - implements RemovalListener>, BulkProcessor.Listener { - - private static final Logger LOG = LoggerFactory.getLogger(IndexerBolt.class); - - private static final String ESBoltType = "indexer"; - - static final String ESIndexNameParamName = "es.indexer.index.name"; - private static final String ESCreateParamName = "es.indexer.create"; - private static final String ESIndexPipelineParamName = "es.indexer.pipeline"; - - private OutputCollector _collector; - - private String indexName; - - private String pipeline; - - // whether the document will be created only if it does not exist or - // overwritten - private boolean create = false; - - private MultiCountMetric eventCounter; - - private ElasticSearchConnection connection; - - private MultiReducedMetric perSecMetrics; - - private Cache> waitAck; - - // Be fair due to cache timeout - private final ReentrantLock waitAckLock = new ReentrantLock(true); - - public IndexerBolt() {} - - /** Sets the index name instead of taking it from the configuration. 
* */ - public IndexerBolt(String indexName) { - this.indexName = indexName; - } - - @Override - public void prepare( - Map conf, TopologyContext context, OutputCollector collector) { - super.prepare(conf, context, collector); - _collector = collector; - if (indexName == null) { - indexName = ConfUtils.getString(conf, IndexerBolt.ESIndexNameParamName, "content"); - } - - create = ConfUtils.getBoolean(conf, IndexerBolt.ESCreateParamName, false); - pipeline = ConfUtils.getString(conf, IndexerBolt.ESIndexPipelineParamName); - - try { - connection = ElasticSearchConnection.getConnection(conf, ESBoltType, this); - } catch (Exception e1) { - LOG.error("Can't connect to ElasticSearch", e1); - throw new RuntimeException(e1); - } - - this.eventCounter = - context.registerMetric("ElasticSearchIndexer", new MultiCountMetric(), 10); - - this.perSecMetrics = - context.registerMetric( - "Indexer_average_persec", - new MultiReducedMetric(new PerSecondReducer()), - 10); - - waitAck = - Caffeine.newBuilder() - .expireAfterWrite(60, TimeUnit.SECONDS) - .removalListener(this) - .build(); - - context.registerMetric("waitAck", () -> waitAck.estimatedSize(), 10); - } - - public void onRemoval( - @Nullable String key, @Nullable List value, @NotNull RemovalCause cause) { - if (!cause.wasEvicted()) return; - if (value != null) { - LOG.error("Purged from waitAck {} with {} values", key, value.size()); - for (Tuple t : value) { - _collector.fail(t); - } - } else { - // This should never happen, but log it anyway. - LOG.error("Purged from waitAck {} with no values", key); - } - } - - @Override - public void cleanup() { - if (connection != null) connection.close(); - } - - @Override - public void execute(Tuple tuple) { - - String url = tuple.getStringByField("url"); - - // Distinguish the value used for indexing - // from the one used for the status - String normalisedurl = valueForURL(tuple); - - LOG.info("Indexing {} as {}", url, normalisedurl); - - Metadata metadata = (Metadata) tuple.getValueByField("metadata"); - - boolean keep = filterDocument(metadata); - if (!keep) { - LOG.info("Filtered {}", url); - eventCounter.scope("Filtered").incrBy(1); - // treat it as successfully processed even if - // we do not index it - _collector.emit(StatusStreamName, tuple, new Values(url, metadata, Status.FETCHED)); - _collector.ack(tuple); - return; - } - - String docID = getDocumentID(metadata, normalisedurl); - - try { - XContentBuilder builder = jsonBuilder().startObject(); - - // display text of the document? - if (StringUtils.isNotBlank(fieldNameForText())) { - final String text = trimText(tuple.getStringByField("text")); - if (!ignoreEmptyFields() || StringUtils.isNotBlank(text)) { - builder.field(fieldNameForText(), trimText(text)); - } - } - - // send URL as field? - if (StringUtils.isNotBlank(fieldNameForURL())) { - builder.field(fieldNameForURL(), normalisedurl); - } - - // which metadata to display? 
- Map keyVals = filterMetadata(metadata); - - for (String fieldName : keyVals.keySet()) { - String[] values = keyVals.get(fieldName); - if (values.length == 1) { - if (!ignoreEmptyFields() || StringUtils.isNotBlank(values[0])) { - builder.field(fieldName, values[0]); - } - } else if (values.length > 1) { - builder.array(fieldName, values); - } - } - - builder.endObject(); - - IndexRequest indexRequest = - new IndexRequest(getIndexName(metadata)).source(builder).id(docID); - - DocWriteRequest.OpType optype = DocWriteRequest.OpType.INDEX; - - if (create) { - optype = DocWriteRequest.OpType.CREATE; - } - - indexRequest.opType(optype); - - if (pipeline != null) { - indexRequest.setPipeline(pipeline); - } - - connection.addToProcessor(indexRequest); - - eventCounter.scope("Indexed").incrBy(1); - perSecMetrics.scope("Indexed").update(1); - - waitAckLock.lock(); - try { - List tt = waitAck.getIfPresent(docID); - if (tt == null) { - tt = new LinkedList<>(); - waitAck.put(docID, tt); - } - tt.add(tuple); - LOG.debug("Added to waitAck {} with ID {} total {}", url, docID, tt.size()); - } finally { - waitAckLock.unlock(); - } - } catch (IOException e) { - LOG.error("Error building document for ES", e); - // do not send to status stream so that it gets replayed - _collector.fail(tuple); - - waitAckLock.lock(); - try { - waitAck.invalidate(docID); - } finally { - waitAckLock.unlock(); - } - } - } - - /** - * Must be overridden for implementing custom index names based on some metadata information By - * Default, indexName coming from config is used - */ - protected String getIndexName(Metadata m) { - return indexName; - } - - @Override - public void beforeBulk(long executionId, BulkRequest request) { - eventCounter.scope("bulks_sent").incrBy(1); - } - - @Override - public void afterBulk(long executionId, BulkRequest request, BulkResponse response) { - eventCounter.scope("bulks_received").incrBy(1); - eventCounter.scope("bulk_msec").incrBy(response.getTook().getMillis()); - - var idsToBulkItemsWithFailedFlag = - Arrays.stream(response.getItems()) - .map( - bir -> { - String id = bir.getId(); - BulkItemResponse.Failure f = bir.getFailure(); - boolean failed = false; - if (f != null) { - if (f.getStatus().equals(RestStatus.CONFLICT)) { - eventCounter.scope("doc_conflicts").incrBy(1); - LOG.debug("Doc conflict ID {}", id); - } else { - failed = true; - } - } - return new BulkItemResponseToFailedFlag(bir, failed); - }) - .collect( - // https://github.com/DigitalPebble/storm-crawler/issues/832 - Collectors.groupingBy( - idWithFailedFlagTuple -> idWithFailedFlagTuple.id, - Collectors.toUnmodifiableList())); - - Map> presentTuples; - long estimatedSize; - Set debugInfo = null; - waitAckLock.lock(); - try { - presentTuples = waitAck.getAllPresent(idsToBulkItemsWithFailedFlag.keySet()); - if (!presentTuples.isEmpty()) { - waitAck.invalidateAll(presentTuples.keySet()); - } - estimatedSize = waitAck.estimatedSize(); - // Only if we have to. 
- if (LOG.isDebugEnabled() && estimatedSize > 0L) { - debugInfo = new HashSet<>(waitAck.asMap().keySet()); - } - } finally { - waitAckLock.unlock(); - } - - int ackCount = 0; - int failureCount = 0; - - for (var entry : presentTuples.entrySet()) { - final var id = entry.getKey(); - final var associatedTuple = entry.getValue(); - final var bulkItemsWithFailedFlag = idsToBulkItemsWithFailedFlag.get(id); - - BulkItemResponseToFailedFlag selected; - - if (bulkItemsWithFailedFlag.size() == 1) { - selected = bulkItemsWithFailedFlag.get(0); - } else { - // Fallback if there are multiple responses for the same id - BulkItemResponseToFailedFlag tmp = null; - var ctFailed = 0; - for (var buwff : bulkItemsWithFailedFlag) { - if (tmp == null) { - tmp = buwff; - } - if (buwff.failed) ctFailed++; - else tmp = buwff; - } - if (ctFailed != bulkItemsWithFailedFlag.size()) { - LOG.warn( - "The id {} would result in an ack and a failure. Using only the ack for processing.", - id); - } - selected = Objects.requireNonNull(tmp); - } - - if (associatedTuple != null) { - LOG.debug("Found {} tuple(s) for ID {}", associatedTuple.size(), id); - for (Tuple t : associatedTuple) { - String url = (String) t.getValueByField("url"); - - Metadata metadata = (Metadata) t.getValueByField("metadata"); - - if (!selected.failed) { - ackCount++; - _collector.emit( - StatusStreamName, t, new Values(url, metadata, Status.FETCHED)); - _collector.ack(t); - } else { - failureCount++; - var failure = selected.getFailure(); - LOG.error("update ID {}, URL {}, failure: {}", id, url, failure); - // there is something wrong with the content we should - // treat - // it as an ERROR - if (selected.getFailure().getStatus().equals(RestStatus.BAD_REQUEST)) { - metadata.setValue(Constants.STATUS_ERROR_SOURCE, "ES indexing"); - metadata.setValue(Constants.STATUS_ERROR_MESSAGE, "invalid content"); - _collector.emit( - StatusStreamName, t, new Values(url, metadata, Status.ERROR)); - _collector.ack(t); - LOG.debug("Acked {} with ID {}", url, id); - } else { - LOG.error("update ID {}, URL {}, failure: {}", id, url, failure); - // there is something wrong with the content we - // should - // treat - // it as an ERROR - if (failure.getStatus().equals(RestStatus.BAD_REQUEST)) { - metadata.setValue(Constants.STATUS_ERROR_SOURCE, "ES indexing"); - metadata.setValue( - Constants.STATUS_ERROR_MESSAGE, "invalid content"); - _collector.emit( - StatusStreamName, - t, - new Values(url, metadata, Status.ERROR)); - _collector.ack(t); - } - // otherwise just fail it - else { - _collector.fail(t); - } - } - } - } - } else { - LOG.warn("Could not find unacked tuples for {}", entry.getKey()); - } - } - - LOG.info( - "Bulk response [{}] : items {}, waitAck {}, acked {}, failed {}", - executionId, - idsToBulkItemsWithFailedFlag.size(), - estimatedSize, - ackCount, - failureCount); - if (debugInfo != null) { - for (String kinaw : debugInfo) { - LOG.debug("Still in wait ack after bulk response [{}] => {}", executionId, kinaw); - } - } - } - - @Override - public void afterBulk(long executionId, BulkRequest request, Throwable failure) { - eventCounter.scope("bulks_received").incrBy(1); - LOG.error("Exception with bulk {} - failing the whole lot ", executionId, failure); - - final var failedIds = - request.requests().stream() - .map(DocWriteRequest::id) - .collect(Collectors.toUnmodifiableSet()); - waitAckLock.lock(); - Map> failedTupleLists; - try { - failedTupleLists = waitAck.getAllPresent(failedIds); - if (!failedTupleLists.isEmpty()) { - 
waitAck.invalidateAll(failedTupleLists.keySet()); - } - } finally { - waitAckLock.unlock(); - } - - for (var id : failedIds) { - var failedTuples = failedTupleLists.get(id); - if (failedTuples != null) { - LOG.debug("Failed {} tuple(s) for ID {}", failedTuples.size(), id); - for (Tuple x : failedTuples) { - // fail it - eventCounter.scope("failed").incrBy(1); - _collector.fail(x); - } - } else { - LOG.warn("Could not find unacked tuple for {}", id); - } - } - } -} diff --git a/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/filtering/JSONURLFilterWrapper.java b/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/filtering/JSONURLFilterWrapper.java deleted file mode 100644 index 068875ecf..000000000 --- a/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/filtering/JSONURLFilterWrapper.java +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to you under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.stormcrawler.elasticsearch.filtering; - -import com.fasterxml.jackson.databind.JsonNode; -import java.io.ByteArrayInputStream; -import java.net.URL; -import java.util.Map; -import java.util.Timer; -import java.util.TimerTask; -import org.apache.stormcrawler.JSONResource; -import org.apache.stormcrawler.Metadata; -import org.apache.stormcrawler.elasticsearch.ElasticSearchConnection; -import org.apache.stormcrawler.filtering.URLFilter; -import org.elasticsearch.action.get.GetRequest; -import org.elasticsearch.action.get.GetResponse; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestHighLevelClient; -import org.jetbrains.annotations.NotNull; -import org.jetbrains.annotations.Nullable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Wraps a URLFilter whose resources are in a JSON file that can be stored in ES. The benefit of - * doing this is that the resources can be refreshed automatically and modified without having to - * recompile the jar and restart the topology. The connection to ES is done via the config and uses - * a new bolt type 'config'. - * - *

The configuration of the delegate is done in the urlfilters.json as usual. - * - *

- *  {
- *     "class": "org.apache.stormcrawler.elasticsearch.filtering.JSONURLFilterWrapper",
- *     "name": "ESFastURLFilter",
- *     "params": {
- *         "refresh": "60",
- *         "delegate": {
- *             "class": "org.apache.stormcrawler.filtering.regex.FastURLFilter",
- *             "params": {
- *                 "file": "fast.urlfilter.json"
- *             }
- *         }
- *     }
- *  }
- * 
- * - * The resource file can be pushed to ES with - * - *
- *  curl -XPUT 'localhost:9200/config/config/fast.urlfilter.json?pretty' -H 'Content-Type: application/json' -d @fast.urlfilter.json
- * 
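Since the wrapper opens its own connection under the bolt type `config`, the topology configuration also needs the matching `es.config.*` keys. A minimal sketch with placeholder values; only the key pattern (`es.<boltType>.addresses`, `.user`, `.password`) comes from `ElasticSearchConnection.getClient`:

```java
import java.util.HashMap;
import java.util.Map;

public class ConfigBoltTypeSettingsSketch {
    // Illustrative settings for the "config" bolt type used by the wrapper; adjust the
    // address and credentials to the actual cluster.
    public static Map<String, Object> configConnectionSettings() {
        Map<String, Object> conf = new HashMap<>();
        conf.put("es.config.addresses", "localhost:9200");
        conf.put("es.config.user", "elastic");      // optional basic auth
        conf.put("es.config.password", "changeme"); // optional basic auth
        return conf;
    }
}
```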
- */ -public class JSONURLFilterWrapper extends URLFilter { - - private static final Logger LOG = LoggerFactory.getLogger(JSONURLFilterWrapper.class); - - private URLFilter delegatedURLFilter; - - public void configure(@NotNull Map stormConf, @NotNull JsonNode filterParams) { - - String urlfilterclass = null; - - JsonNode delegateNode = filterParams.get("delegate"); - if (delegateNode == null) { - throw new RuntimeException("delegateNode undefined!"); - } - - JsonNode node = delegateNode.get("class"); - if (node != null && node.isTextual()) { - urlfilterclass = node.asText(); - } - - if (urlfilterclass == null) { - throw new RuntimeException("urlfilter.class undefined!"); - } - - // load an instance of the delegated parsefilter - try { - Class filterClass = Class.forName(urlfilterclass); - - boolean subClassOK = URLFilter.class.isAssignableFrom(filterClass); - if (!subClassOK) { - throw new RuntimeException( - "Filter " + urlfilterclass + " does not extend URLFilter"); - } - - delegatedURLFilter = (URLFilter) filterClass.newInstance(); - - // check that it implements JSONResource - if (!JSONResource.class.isInstance(delegatedURLFilter)) { - throw new RuntimeException( - "Filter " + urlfilterclass + " does not implement JSONResource"); - } - - } catch (Exception e) { - LOG.error("Can't setup {}: {}", urlfilterclass, e); - throw new RuntimeException("Can't setup " + urlfilterclass, e); - } - - // configure it - node = delegateNode.get("params"); - - delegatedURLFilter.configure(stormConf, node); - - int refreshRate = 600; - - node = filterParams.get("refresh"); - if (node != null && node.isInt()) { - refreshRate = node.asInt(refreshRate); - } - - final JSONResource resource = (JSONResource) delegatedURLFilter; - - new Timer() - .schedule( - new TimerTask() { - private RestHighLevelClient esClient; - - public void run() { - if (esClient == null) { - try { - esClient = - ElasticSearchConnection.getClient( - stormConf, "config"); - } catch (Exception e) { - LOG.error("Exception while creating ES connection", e); - } - } - if (esClient != null) { - LOG.info("Reloading json resources from ES"); - try { - GetResponse response = - esClient.get( - new GetRequest( - "config", - "config", - resource.getResourceFile()), - RequestOptions.DEFAULT); - resource.loadJSONResources( - new ByteArrayInputStream( - response.getSourceAsBytes())); - } catch (Exception e) { - LOG.error("Can't load config from ES", e); - } - } - } - }, - 0, - refreshRate * 1000); - } - - @Override - public @Nullable String filter( - @Nullable URL sourceUrl, - @Nullable Metadata sourceMetadata, - @NotNull String urlToFilter) { - return delegatedURLFilter.filter(sourceUrl, sourceMetadata, urlToFilter); - } -} diff --git a/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/metrics/MetricsConsumer.java b/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/metrics/MetricsConsumer.java deleted file mode 100644 index 5f2270420..000000000 --- a/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/metrics/MetricsConsumer.java +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to you under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.stormcrawler.elasticsearch.metrics; - -import static org.elasticsearch.xcontent.XContentFactory.jsonBuilder; - -import java.text.SimpleDateFormat; -import java.util.Collection; -import java.util.Date; -import java.util.Iterator; -import java.util.Map; -import java.util.Map.Entry; -import org.apache.storm.metric.api.IMetricsConsumer; -import org.apache.storm.task.IErrorReporter; -import org.apache.storm.task.TopologyContext; -import org.apache.stormcrawler.elasticsearch.ElasticSearchConnection; -import org.apache.stormcrawler.util.ConfUtils; -import org.elasticsearch.action.index.IndexRequest; -import org.elasticsearch.xcontent.XContentBuilder; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Sends metrics to an Elasticsearch index. The ES details are set in the configuration; an optional - * argument sets a date format to append to the index name. - * - *
- *   topology.metrics.consumer.register:
- *        - class: "org.apache.stormcrawler.elasticsearch.metrics.MetricsConsumer"
- *          parallelism.hint: 1
- *          argument: "yyyy-MM-dd"
- * 
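The same registration can be done programmatically; the sketch below mirrors the YAML above and is only illustrative. The optional argument is parsed as a `SimpleDateFormat` and appended to the index name, which yields one metrics index per day for `yyyy-MM-dd`.

```java
import org.apache.storm.Config;
import org.apache.stormcrawler.elasticsearch.metrics.MetricsConsumer;

public class MetricsConsumerRegistrationSketch {
    // Equivalent of the YAML registration: consumer class, parallelism hint of 1, and the
    // date-format argument used to suffix the metrics index name.
    public static void register(Config conf) {
        conf.registerMetricsConsumer(MetricsConsumer.class, "yyyy-MM-dd", 1);
    }
}
```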
- */ -public class MetricsConsumer implements IMetricsConsumer { - - private final Logger LOG = LoggerFactory.getLogger(getClass()); - - private static final String ESBoltType = "metrics"; - - /** name of the index to use for the metrics (default : metrics) * */ - private static final String ESMetricsIndexNameParamName = "es." + ESBoltType + ".index.name"; - - private String indexName; - - private ElasticSearchConnection connection; - - private String stormID; - - /** optional date format passed as argument, must be parsable as a SimpleDateFormat */ - private SimpleDateFormat dateFormat; - - @Override - public void prepare( - Map stormConf, - Object registrationArgument, - TopologyContext context, - IErrorReporter errorReporter) { - indexName = ConfUtils.getString(stormConf, ESMetricsIndexNameParamName, "metrics"); - stormID = context.getStormId(); - if (registrationArgument != null) { - dateFormat = new SimpleDateFormat((String) registrationArgument); - LOG.info("Using date format {}", registrationArgument); - } - try { - connection = ElasticSearchConnection.getConnection(stormConf, ESBoltType); - } catch (Exception e1) { - LOG.error("Can't connect to ElasticSearch", e1); - throw new RuntimeException(e1); - } - } - - @Override - public void cleanup() { - if (connection != null) connection.close(); - } - - @Override - public void handleDataPoints(TaskInfo taskInfo, Collection dataPoints) { - final Date now = new Date(); - for (DataPoint dataPoint : dataPoints) { - handleDataPoints(taskInfo, dataPoint.name, dataPoint.value, now); - } - } - - private void handleDataPoints( - final TaskInfo taskInfo, final String nameprefix, final Object value, final Date now) { - if (value instanceof Number) { - indexDataPoint(taskInfo, now, nameprefix, ((Number) value).doubleValue()); - } else if (value instanceof Map) { - Iterator keyValiter = ((Map) value).entrySet().iterator(); - while (keyValiter.hasNext()) { - Entry entry = keyValiter.next(); - String newnameprefix = nameprefix + "." + entry.getKey(); - handleDataPoints(taskInfo, newnameprefix, entry.getValue(), now); - } - } else if (value instanceof Collection) { - for (Object collectionObj : (Collection) value) { - handleDataPoints(taskInfo, nameprefix, collectionObj, now); - } - } else { - LOG.warn("Found data point value {} of {}", nameprefix, value.getClass().toString()); - } - } - - /** - * Returns the name of the index that metrics will be written to. 
- * - * @return elastic index name - */ - private String getIndexName(Date timestamp) { - if (dateFormat == null) return indexName; - - StringBuilder sb = new StringBuilder(indexName); - sb.append("-").append(dateFormat.format(timestamp)); - return sb.toString(); - } - - private void indexDataPoint(TaskInfo taskInfo, Date timestamp, String name, double value) { - try { - XContentBuilder builder = jsonBuilder().startObject(); - builder.field("stormId", stormID); - builder.field("srcComponentId", taskInfo.srcComponentId); - builder.field("srcTaskId", taskInfo.srcTaskId); - builder.field("srcWorkerHost", taskInfo.srcWorkerHost); - builder.field("srcWorkerPort", taskInfo.srcWorkerPort); - builder.field("name", name); - builder.field("value", value); - builder.field("timestamp", timestamp); - builder.endObject(); - - IndexRequest indexRequest = new IndexRequest(getIndexName(timestamp)).source(builder); - connection.addToProcessor(indexRequest); - } catch (Exception e) { - LOG.error("problem when building request for ES", e); - } - } -} diff --git a/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/metrics/StatusMetricsBolt.java b/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/metrics/StatusMetricsBolt.java deleted file mode 100644 index eeda63026..000000000 --- a/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/metrics/StatusMetricsBolt.java +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to you under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.stormcrawler.elasticsearch.metrics; - -import java.util.HashMap; -import java.util.Map; -import org.apache.storm.Config; -import org.apache.storm.task.OutputCollector; -import org.apache.storm.task.TopologyContext; -import org.apache.storm.topology.OutputFieldsDeclarer; -import org.apache.storm.topology.base.BaseRichBolt; -import org.apache.storm.tuple.Tuple; -import org.apache.storm.utils.TupleUtils; -import org.apache.stormcrawler.elasticsearch.ElasticSearchConnection; -import org.apache.stormcrawler.util.ConfUtils; -import org.elasticsearch.action.ActionListener; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.core.CountRequest; -import org.elasticsearch.client.core.CountResponse; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.search.builder.SearchSourceBuilder; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Queries the status index periodically to get the count of URLs per status. This bolt can be - * connected to the output of any other bolt and will not produce anything as output. 
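Because the bolt only reacts to its own tick tuples, it simply needs to be attached to some upstream component. A wiring sketch based on ESCrawlTopology earlier in this diff (component names and parallelism are the archetype's):

```java
import org.apache.storm.topology.TopologyBuilder;
import org.apache.stormcrawler.elasticsearch.metrics.StatusMetricsBolt;
import org.apache.stormcrawler.elasticsearch.persistence.AggregationSpout;

public class StatusMetricsWiringSketch {
    // The bolt queries the status index on every tick and exposes the per-status counts
    // as a "status.count" metric; the grouping used to reach it does not matter.
    public static void wire(TopologyBuilder builder) {
        builder.setSpout("spout", new AggregationSpout(), 1);
        builder.setBolt("status_metrics", new StatusMetricsBolt()).shuffleGrouping("spout");
    }
}
```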
- */ -public class StatusMetricsBolt extends BaseRichBolt { - - private static final Logger LOG = LoggerFactory.getLogger(StatusMetricsBolt.class); - - private static final String ESBoltType = "status"; - private static final String ESStatusIndexNameParamName = "es.status.index.name"; - - private String indexName; - - private ElasticSearchConnection connection; - - private Map latestStatusCounts = new HashMap<>(6); - - private int freqStats = 60; - - private OutputCollector _collector; - - private transient StatusActionListener[] listeners; - - private class StatusActionListener implements ActionListener { - - private final String name; - - private boolean ready = true; - - public boolean isReady() { - return ready; - } - - public void busy() { - this.ready = false; - } - - StatusActionListener(String statusName) { - name = statusName; - } - - @Override - public void onResponse(CountResponse response) { - ready = true; - LOG.debug("Got {} counts for status:{}", response.getCount(), name); - latestStatusCounts.put(name, response.getCount()); - } - - @Override - public void onFailure(Exception e) { - ready = true; - LOG.error("Failure when getting counts for status:{}", name, e); - } - } - - @Override - public void prepare( - Map stormConf, TopologyContext context, OutputCollector collector) { - _collector = collector; - indexName = ConfUtils.getString(stormConf, ESStatusIndexNameParamName, "status"); - try { - connection = ElasticSearchConnection.getConnection(stormConf, ESBoltType); - } catch (Exception e1) { - LOG.error("Can't connect to ElasticSearch", e1); - throw new RuntimeException(e1); - } - - context.registerMetric( - "status.count", - () -> { - return latestStatusCounts; - }, - freqStats); - - listeners = new StatusActionListener[6]; - - listeners[0] = new StatusActionListener("DISCOVERED"); - listeners[1] = new StatusActionListener("FETCHED"); - listeners[2] = new StatusActionListener("FETCH_ERROR"); - listeners[3] = new StatusActionListener("REDIRECTION"); - listeners[4] = new StatusActionListener("ERROR"); - listeners[5] = new StatusActionListener("TOTAL"); - } - - @Override - public Map getComponentConfiguration() { - Config conf = new Config(); - conf.put(Config.TOPOLOGY_TICK_TUPLE_FREQ_SECS, freqStats); - return conf; - } - - @Override - public void execute(Tuple input) { - _collector.ack(input); - - // this bolt can be connected to anything - // we just want to trigger a new search when the input is a tick tuple - if (!TupleUtils.isTick(input)) { - return; - } - - for (StatusActionListener listener : listeners) { - // still waiting for results from previous request - if (!listener.isReady()) { - LOG.debug("Not ready to get counts for status {}", listener.name); - continue; - } - CountRequest request = new CountRequest(indexName); - if (!listener.name.equalsIgnoreCase("TOTAL")) { - SearchSourceBuilder sourceBuilder = new SearchSourceBuilder(); - sourceBuilder.query(QueryBuilders.termQuery("status", listener.name)); - request.source(sourceBuilder); - } - listener.busy(); - connection.getClient().countAsync(request, RequestOptions.DEFAULT, listener); - } - } - - @Override - public void cleanup() { - connection.close(); - } - - @Override - public void declareOutputFields(OutputFieldsDeclarer declarer) { - // NONE - THIS BOLT DOES NOT GET CONNECTED TO ANY OTHERS - } -} diff --git a/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/parse/filter/JSONResourceWrapper.java 
b/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/parse/filter/JSONResourceWrapper.java deleted file mode 100644 index 5b34830aa..000000000 --- a/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/parse/filter/JSONResourceWrapper.java +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to you under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.stormcrawler.elasticsearch.parse.filter; - -import com.fasterxml.jackson.databind.JsonNode; -import java.io.ByteArrayInputStream; -import java.util.Map; -import java.util.Timer; -import java.util.TimerTask; -import org.apache.stormcrawler.JSONResource; -import org.apache.stormcrawler.elasticsearch.ElasticSearchConnection; -import org.apache.stormcrawler.parse.ParseFilter; -import org.apache.stormcrawler.parse.ParseResult; -import org.elasticsearch.action.get.GetRequest; -import org.elasticsearch.action.get.GetResponse; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestHighLevelClient; -import org.jetbrains.annotations.NotNull; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.w3c.dom.DocumentFragment; - -/** - * Wraps a ParseFilter whose resources are in a JSON file that can be stored in ES. The benefit of - * doing this is that the resources can be refreshed automatically and modified without having to - * recompile the jar and restart the topology. The connection to ES is done via the config and uses - * a new bolt type 'config'. - * - *

- * The configuration of the delegate is done in the parsefilters.json as usual.
- *

- *  {
- *     "class": "org.apache.stormcrawler.elasticsearch.parse.filter.JSONResourceWrapper",
- *     "name": "ESCollectionTagger",
- *     "params": {
- *         "refresh": "60",
- *         "delegate": {
- *             "class": "org.apache.stormcrawler.parse.filter.CollectionTagger",
- *             "params": {
- *                 "file": "collections.json"
- *             }
- *         }
- *     }
- *  }
- * 
- *
- * The resource file can be pushed to ES with
- *
- *  curl -XPUT "$ESHOST/config/_create/collections.json" -H 'Content-Type: application/json' -d @src/main/resources/collections.json
- * 
- */ -public class JSONResourceWrapper extends ParseFilter { - - private static final Logger LOG = LoggerFactory.getLogger(JSONResourceWrapper.class); - - private ParseFilter delegatedParseFilter; - - public void configure(@NotNull Map stormConf, @NotNull JsonNode filterParams) { - - String parsefilterclass = null; - - JsonNode delegateNode = filterParams.get("delegate"); - if (delegateNode == null) { - throw new RuntimeException("delegateNode undefined!"); - } - - JsonNode node = delegateNode.get("class"); - if (node != null && node.isTextual()) { - parsefilterclass = node.asText(); - } - - if (parsefilterclass == null) { - throw new RuntimeException("parsefilter.class undefined!"); - } - - // load an instance of the delegated parsefilter - try { - Class filterClass = Class.forName(parsefilterclass); - - boolean subClassOK = ParseFilter.class.isAssignableFrom(filterClass); - if (!subClassOK) { - throw new RuntimeException( - "Filter " + parsefilterclass + " does not extend ParseFilter"); - } - - delegatedParseFilter = (ParseFilter) filterClass.newInstance(); - - // check that it implements JSONResource - if (!JSONResource.class.isInstance(delegatedParseFilter)) { - throw new RuntimeException( - "Filter " + parsefilterclass + " does not implement JSONResource"); - } - - } catch (Exception e) { - LOG.error("Can't setup {}: {}", parsefilterclass, e); - throw new RuntimeException("Can't setup " + parsefilterclass, e); - } - - // configure it - node = delegateNode.get("params"); - - delegatedParseFilter.configure(stormConf, node); - - int refreshRate = 600; - - node = filterParams.get("refresh"); - if (node != null && node.isInt()) { - refreshRate = node.asInt(refreshRate); - } - - final JSONResource resource = (JSONResource) delegatedParseFilter; - - new Timer() - .schedule( - new TimerTask() { - private RestHighLevelClient esClient; - - public void run() { - if (esClient == null) { - try { - esClient = - ElasticSearchConnection.getClient( - stormConf, "config"); - } catch (Exception e) { - LOG.error("Exception while creating ES connection", e); - } - } - if (esClient != null) { - LOG.info("Reloading json resources from ES"); - try { - GetResponse response = - esClient.get( - new GetRequest( - "config", - resource.getResourceFile()), - RequestOptions.DEFAULT); - resource.loadJSONResources( - new ByteArrayInputStream( - response.getSourceAsBytes())); - } catch (Exception e) { - LOG.error("Can't load config from ES", e); - } - } - } - }, - 0, - refreshRate * 1000); - } - - @Override - public void filter(String URL, byte[] content, DocumentFragment doc, ParseResult parse) { - delegatedParseFilter.filter(URL, content, doc, parse); - } -} diff --git a/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/persistence/AbstractSpout.java b/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/persistence/AbstractSpout.java deleted file mode 100644 index ec5b32c2e..000000000 --- a/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/persistence/AbstractSpout.java +++ /dev/null @@ -1,225 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to you under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.stormcrawler.elasticsearch.persistence; - -import java.io.IOException; -import java.util.Date; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import org.apache.storm.spout.SpoutOutputCollector; -import org.apache.storm.task.TopologyContext; -import org.apache.stormcrawler.Metadata; -import org.apache.stormcrawler.elasticsearch.ElasticSearchConnection; -import org.apache.stormcrawler.persistence.AbstractQueryingSpout; -import org.apache.stormcrawler.util.ConfUtils; -import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.search.SearchHit; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public abstract class AbstractSpout extends AbstractQueryingSpout { - - private static final Logger LOG = LoggerFactory.getLogger(AbstractSpout.class); - - protected static final String ESBoltType = "status"; - protected static final String ESStatusIndexNameParamName = "es.status.index.name"; - - /** Field name to use for aggregating * */ - protected static final String ESStatusBucketFieldParamName = "es.status.bucket.field"; - - protected static final String ESStatusMaxBucketParamName = "es.status.max.buckets"; - protected static final String ESStatusMaxURLsParamName = "es.status.max.urls.per.bucket"; - - /** Field name to use for sorting the URLs within a bucket, not used if empty or null. */ - protected static final String ESStatusBucketSortFieldParamName = "es.status.bucket.sort.field"; - - /** Field name to use for sorting the buckets, not used if empty or null. */ - protected static final String ESStatusGlobalSortFieldParamName = "es.status.global.sort.field"; - - protected static final String ESStatusFilterParamName = "es.status.filterQuery"; - - protected static final String ESStatusQueryTimeoutParamName = "es.status.query.timeout"; - - /** Query to use as a positive filter, set by es.status.filterQuery */ - protected List filterQueries = null; - - protected String indexName; - - protected static RestHighLevelClient client; - - /** - * when using multiple instances - each one is in charge of a specific shard useful when - * sharding based on host or domain to guarantee a good mix of URLs - */ - protected int shardID = -1; - - /** Used to distinguish between instances in the logs * */ - protected String logIdprefix = ""; - - /** Field name used for field collapsing e.g. 
key * */ - protected String partitionField; - - protected int maxURLsPerBucket = 10; - - protected int maxBucketNum = 10; - - protected List bucketSortField = new LinkedList<>(); - - protected String totalSortField = ""; - - protected Date queryDate; - - protected int queryTimeout = -1; - - @Override - public void open( - Map stormConf, - TopologyContext context, - SpoutOutputCollector collector) { - - super.open(stormConf, context, collector); - - indexName = ConfUtils.getString(stormConf, ESStatusIndexNameParamName, "status"); - - // one ES client per JVM - synchronized (AbstractSpout.class) { - try { - if (client == null) { - client = ElasticSearchConnection.getClient(stormConf, ESBoltType); - } - } catch (Exception e1) { - LOG.error("Can't connect to ElasticSearch", e1); - throw new RuntimeException(e1); - } - } - - // if more than one instance is used we expect their number to be the - // same as the number of shards - int totalTasks = context.getComponentTasks(context.getThisComponentId()).size(); - if (totalTasks > 1) { - logIdprefix = - "[" + context.getThisComponentId() + " #" + context.getThisTaskIndex() + "] "; - - // determine the number of shards so that we can restrict the - // search - - // TODO use the admin API when it gets available - // TODO or the low level one with - // https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-shards-stores.html - // TODO identify local shards and use those if possible - - // ClusterSearchShardsRequest request = new - // ClusterSearchShardsRequest( - // indexName); - // ClusterSearchShardsResponse shardresponse = client.admin() - // .cluster().searchShards(request).actionGet(); - // ClusterSearchShardsGroup[] shardgroups = - // shardresponse.getGroups(); - // if (totalTasks != shardgroups.length) { - // throw new RuntimeException( - // "Number of ES spout instances should be the same as number of - // shards (" - // + shardgroups.length + ") but is " + totalTasks); - // } - // shardID = shardgroups[context.getThisTaskIndex()].getShardId() - // .getId(); - - // TEMPORARY simply use the task index as shard index - shardID = context.getThisTaskIndex(); - LOG.info("{} assigned shard ID {}", logIdprefix, shardID); - } - - partitionField = ConfUtils.getString(stormConf, ESStatusBucketFieldParamName, "key"); - - bucketSortField = ConfUtils.loadListFromConf(ESStatusBucketSortFieldParamName, stormConf); - - totalSortField = ConfUtils.getString(stormConf, ESStatusGlobalSortFieldParamName); - - maxURLsPerBucket = ConfUtils.getInt(stormConf, ESStatusMaxURLsParamName, 1); - maxBucketNum = ConfUtils.getInt(stormConf, ESStatusMaxBucketParamName, 10); - - queryTimeout = ConfUtils.getInt(stormConf, ESStatusQueryTimeoutParamName, -1); - - filterQueries = ConfUtils.loadListFromConf(ESStatusFilterParamName, stormConf); - } - - /** Builds a query and use it retrieve the results from ES * */ - protected abstract void populateBuffer(); - - protected final boolean addHitToBuffer(SearchHit hit) { - Map keyValues = hit.getSourceAsMap(); - String url = (String) keyValues.get("url"); - // is already being processed - skip it! 
- if (beingProcessed.containsKey(url)) { - return false; - } - Metadata metadata = fromKeyValues(keyValues); - addHitInfoToMetadata(metadata, hit); - - return buffer.add(url, metadata); - } - - protected void addHitInfoToMetadata(Metadata metadata, SearchHit hit) {} - - protected final Metadata fromKeyValues(Map keyValues) { - Map> mdAsMap = (Map>) keyValues.get("metadata"); - Metadata metadata = new Metadata(); - if (mdAsMap != null) { - Iterator>> mdIter = mdAsMap.entrySet().iterator(); - while (mdIter.hasNext()) { - Entry> mdEntry = mdIter.next(); - String key = mdEntry.getKey(); - // periods are not allowed in ES2 - replace with %2E - key = key.replaceAll("%2E", "\\."); - Object mdValObj = mdEntry.getValue(); - // single value - if (mdValObj instanceof String) { - metadata.addValue(key, (String) mdValObj); - } - // multi valued - else { - metadata.addValues(key, (List) mdValObj); - } - } - } - return metadata; - } - - @Override - public void ack(Object msgId) { - LOG.debug("{} Ack for {}", logIdprefix, msgId); - super.ack(msgId); - } - - @Override - public void fail(Object msgId) { - LOG.info("{} Fail for {}", logIdprefix, msgId); - super.fail(msgId); - } - - @Override - public void close() { - if (client != null) - try { - client.close(); - } catch (IOException e) { - } - } -} diff --git a/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/persistence/AggregationSpout.java b/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/persistence/AggregationSpout.java deleted file mode 100644 index 0e1f69dae..000000000 --- a/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/persistence/AggregationSpout.java +++ /dev/null @@ -1,364 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to you under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.stormcrawler.elasticsearch.persistence; - -import static org.elasticsearch.index.query.QueryBuilders.boolQuery; - -import java.time.Instant; -import java.util.Calendar; -import java.util.Date; -import java.util.HashSet; -import java.util.Iterator; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.TimeUnit; -import org.apache.commons.lang.StringUtils; -import org.apache.storm.spout.SpoutOutputCollector; -import org.apache.storm.task.TopologyContext; -import org.apache.stormcrawler.Metadata; -import org.apache.stormcrawler.util.ConfUtils; -import org.elasticsearch.action.ActionListener; -import org.elasticsearch.action.search.SearchRequest; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.core.TimeValue; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.search.SearchHit; -import org.elasticsearch.search.aggregations.AggregationBuilders; -import org.elasticsearch.search.aggregations.Aggregations; -import org.elasticsearch.search.aggregations.BucketOrder; -import org.elasticsearch.search.aggregations.bucket.SingleBucketAggregation; -import org.elasticsearch.search.aggregations.bucket.sampler.DiversifiedAggregationBuilder; -import org.elasticsearch.search.aggregations.bucket.terms.Terms; -import org.elasticsearch.search.aggregations.bucket.terms.Terms.Bucket; -import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder; -import org.elasticsearch.search.aggregations.metrics.TopHits; -import org.elasticsearch.search.builder.SearchSourceBuilder; -import org.elasticsearch.search.sort.FieldSortBuilder; -import org.elasticsearch.search.sort.SortBuilders; -import org.elasticsearch.search.sort.SortOrder; -import org.joda.time.format.ISODateTimeFormat; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Spout which pulls URL from an ES index. Use a single instance unless you use 'es.status.routing' - * with the StatusUpdaterBolt, in which case you need to have exactly the same number of spout - * instances as ES shards. Guarantees a good mix of URLs by aggregating them by an arbitrary field - * e.g. key. 
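A configuration sketch for the aggregation behaviour described above, using the `es.status.*` keys defined in `AbstractSpout`; the values are arbitrary examples, not recommendations.

```java
import org.apache.storm.Config;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.stormcrawler.elasticsearch.persistence.AggregationSpout;

public class AggregationSpoutSketch {
    public static void main(String[] args) {
        Config conf = new Config();
        conf.put("es.status.bucket.field", "key");     // field the URLs are grouped on
        conf.put("es.status.max.buckets", 10);         // buckets returned per query
        conf.put("es.status.max.urls.per.bucket", 2);  // URLs taken from each bucket

        TopologyBuilder builder = new TopologyBuilder();
        // a single instance, unless 'es.status.routing' is used by the status
        // updater, in which case parallelism must equal the number of shards
        builder.setSpout("spout", new AggregationSpout(), 1);
    }
}
```

With `es.status.bucket.field` left at its default of `key`, the URLs are grouped on the partition key written by the status updater, which is what provides the mix of hosts or domains mentioned above.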
- */ -public class AggregationSpout extends AbstractSpout implements ActionListener { - - private static final Logger LOG = LoggerFactory.getLogger(AggregationSpout.class); - - private static final String ESStatusSampleParamName = "es.status.sample"; - private static final String ESMostRecentDateIncreaseParamName = "es.status.recentDate.increase"; - private static final String ESMostRecentDateMinGapParamName = "es.status.recentDate.min.gap"; - - private boolean sample = false; - - private int recentDateIncrease = -1; - private int recentDateMinGap = -1; - - protected Set currentBuckets; - - @Override - public void open( - Map stormConf, - TopologyContext context, - SpoutOutputCollector collector) { - sample = ConfUtils.getBoolean(stormConf, ESStatusSampleParamName, sample); - recentDateIncrease = - ConfUtils.getInt(stormConf, ESMostRecentDateIncreaseParamName, recentDateIncrease); - recentDateMinGap = - ConfUtils.getInt(stormConf, ESMostRecentDateMinGapParamName, recentDateMinGap); - super.open(stormConf, context, collector); - currentBuckets = new HashSet<>(); - } - - @Override - protected void populateBuffer() { - - if (queryDate == null) { - queryDate = new Date(); - lastTimeResetToNOW = Instant.now(); - } - - String formattedQueryDate = ISODateTimeFormat.dateTimeNoMillis().print(queryDate.getTime()); - - LOG.info("{} Populating buffer with nextFetchDate <= {}", logIdprefix, formattedQueryDate); - - BoolQueryBuilder queryBuilder = - boolQuery() - .filter(QueryBuilders.rangeQuery("nextFetchDate").lte(formattedQueryDate)); - - if (filterQueries != null) { - for (String filterQuery : filterQueries) { - queryBuilder.filter(QueryBuilders.queryStringQuery(filterQuery)); - } - } - - SearchRequest request = new SearchRequest(indexName); - - SearchSourceBuilder sourceBuilder = new SearchSourceBuilder(); - sourceBuilder.query(queryBuilder); - sourceBuilder.from(0); - sourceBuilder.size(0); - sourceBuilder.explain(false); - sourceBuilder.trackTotalHits(false); - - if (queryTimeout != -1) { - sourceBuilder.timeout(new TimeValue(queryTimeout, TimeUnit.SECONDS)); - } - - TermsAggregationBuilder aggregations = - AggregationBuilders.terms("partition").field(partitionField).size(maxBucketNum); - - org.elasticsearch.search.aggregations.metrics.TopHitsAggregationBuilder tophits = - AggregationBuilders.topHits("docs").size(maxURLsPerBucket).explain(false); - - // sort within a bucket - for (String bsf : bucketSortField) { - FieldSortBuilder sorter = SortBuilders.fieldSort(bsf).order(SortOrder.ASC); - tophits.sort(sorter); - } - - aggregations.subAggregation(tophits); - - // sort between buckets - if (StringUtils.isNotBlank(totalSortField)) { - org.elasticsearch.search.aggregations.metrics.MinAggregationBuilder minBuilder = - AggregationBuilders.min("top_hit").field(totalSortField); - aggregations.subAggregation(minBuilder); - aggregations.order(BucketOrder.aggregation("top_hit", true)); - } - - if (sample) { - DiversifiedAggregationBuilder sab = new DiversifiedAggregationBuilder("sample"); - sab.field(partitionField).maxDocsPerValue(maxURLsPerBucket); - sab.shardSize(maxURLsPerBucket * maxBucketNum); - sab.subAggregation(aggregations); - sourceBuilder.aggregation(sab); - } else { - sourceBuilder.aggregation(aggregations); - } - - request.source(sourceBuilder); - - // https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-preference.html - // _shards:2,3 - // specific shard but ideally a local copy of it - if (shardID != -1) { - request.preference("_shards:" + shardID + "|_local"); - 
} - - // dump query to log - LOG.debug("{} ES query {}", logIdprefix, request); - - LOG.trace("{} isInquery set to true"); - isInQuery.set(true); - client.searchAsync(request, RequestOptions.DEFAULT, this); - } - - @Override - public void onFailure(Exception arg0) { - LOG.error("{} Exception with ES query", logIdprefix, arg0); - markQueryReceivedNow(); - } - - @Override - public void onResponse(SearchResponse response) { - long timeTaken = System.currentTimeMillis() - getTimeLastQuerySent(); - - Aggregations aggregs = response.getAggregations(); - - if (aggregs == null) { - markQueryReceivedNow(); - return; - } - - SingleBucketAggregation sample = aggregs.get("sample"); - if (sample != null) { - aggregs = sample.getAggregations(); - } - - Terms agg = aggregs.get("partition"); - - int numhits = 0; - int numBuckets = 0; - int alreadyprocessed = 0; - - Instant mostRecentDateFound = null; - - currentBuckets.clear(); - - // For each entry - Iterator iterator = (Iterator) agg.getBuckets().iterator(); - while (iterator.hasNext()) { - Terms.Bucket entry = iterator.next(); - String key = (String) entry.getKey(); // bucket key - - currentBuckets.add(key); - - long docCount = entry.getDocCount(); // Doc count - - int hitsForThisBucket = 0; - - SearchHit lastHit = null; - - // filter results so that we don't include URLs we are already - // being processed - TopHits topHits = entry.getAggregations().get("docs"); - for (SearchHit hit : topHits.getHits().getHits()) { - - LOG.debug( - "{} -> id [{}], _source [{}]", - logIdprefix, - hit.getId(), - hit.getSourceAsString()); - - hitsForThisBucket++; - - lastHit = hit; - - Map keyValues = hit.getSourceAsMap(); - String url = (String) keyValues.get("url"); - - // consider only the first document of the last bucket - // for optimising the nextFetchDate - if (hitsForThisBucket == 1 && !iterator.hasNext()) { - String strDate = (String) keyValues.get("nextFetchDate"); - try { - mostRecentDateFound = Instant.parse(strDate); - } catch (Exception e) { - throw new RuntimeException("can't parse date :" + strDate); - } - } - - // is already being processed or in buffer - skip it! - if (beingProcessed.containsKey(url)) { - LOG.debug("{} -> already processed: {}", logIdprefix, url); - alreadyprocessed++; - continue; - } - - Metadata metadata = fromKeyValues(keyValues); - boolean added = buffer.add(url, metadata); - if (!added) { - LOG.debug("{} -> already in buffer: {}", logIdprefix, url); - alreadyprocessed++; - continue; - } - LOG.debug("{} -> added to buffer : {}", logIdprefix, url); - } - - if (lastHit != null) { - sortValuesForKey(key, lastHit.getSortValues()); - } - - if (hitsForThisBucket > 0) numBuckets++; - - numhits += hitsForThisBucket; - - LOG.debug( - "{} key [{}], hits[{}], doc_count [{}]", - logIdprefix, - key, - hitsForThisBucket, - docCount, - alreadyprocessed); - } - - LOG.info( - "{} ES query returned {} hits from {} buckets in {} msec with {} already being processed. 
Took {} msec per doc on average.", - logIdprefix, - numhits, - numBuckets, - timeTaken, - alreadyprocessed, - ((float) timeTaken / numhits)); - - queryTimes.addMeasurement(timeTaken); - eventCounter.scope("already_being_processed").incrBy(alreadyprocessed); - eventCounter.scope("ES_queries").incrBy(1); - eventCounter.scope("ES_docs").incrBy(numhits); - - // optimise the nextFetchDate by getting the most recent value - // returned in the query and add to it, unless the previous value is - // within n mins in which case we'll keep it - if (mostRecentDateFound != null && recentDateIncrease >= 0) { - Calendar potentialNewDate = Calendar.getInstance(); - potentialNewDate.setTimeInMillis(mostRecentDateFound.toEpochMilli()); - potentialNewDate.add(Calendar.MINUTE, recentDateIncrease); - Date oldDate = null; - // check boundaries - if (this.recentDateMinGap > 0) { - Calendar low = Calendar.getInstance(); - low.setTime(queryDate); - low.add(Calendar.MINUTE, -recentDateMinGap); - Calendar high = Calendar.getInstance(); - high.setTime(queryDate); - high.add(Calendar.MINUTE, recentDateMinGap); - if (high.before(potentialNewDate) || low.after(potentialNewDate)) { - oldDate = queryDate; - } - } else { - oldDate = queryDate; - } - if (oldDate != null) { - queryDate = potentialNewDate.getTime(); - LOG.info( - "{} queryDate changed from {} to {} based on mostRecentDateFound {}", - logIdprefix, - oldDate, - queryDate, - mostRecentDateFound); - } else { - LOG.info( - "{} queryDate kept at {} based on mostRecentDateFound {}", - logIdprefix, - queryDate, - mostRecentDateFound); - } - } - - // reset the value for next fetch date if the previous one is too old - if (resetFetchDateAfterNSecs != -1) { - Instant changeNeededOn = - Instant.ofEpochMilli( - lastTimeResetToNOW.toEpochMilli() + (resetFetchDateAfterNSecs * 1000)); - if (Instant.now().isAfter(changeNeededOn)) { - LOG.info( - "{} queryDate set to null based on resetFetchDateAfterNSecs {}", - logIdprefix, - resetFetchDateAfterNSecs); - queryDate = null; - } - } - - // change the date if we don't get any results at all - if (numBuckets == 0) { - queryDate = null; - } - - // remove lock - markQueryReceivedNow(); - } - - protected void sortValuesForKey(String key, Object[] sortValues) {} -} diff --git a/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/persistence/CollapsingSpout.java b/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/persistence/CollapsingSpout.java deleted file mode 100644 index badcf7aa2..000000000 --- a/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/persistence/CollapsingSpout.java +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to you under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.stormcrawler.elasticsearch.persistence; - -import static org.elasticsearch.index.query.QueryBuilders.boolQuery; - -import java.time.Instant; -import java.util.Date; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.concurrent.TimeUnit; -import org.apache.commons.lang.StringUtils; -import org.apache.storm.spout.SpoutOutputCollector; -import org.apache.storm.task.TopologyContext; -import org.apache.stormcrawler.util.ConfUtils; -import org.elasticsearch.action.ActionListener; -import org.elasticsearch.action.search.SearchRequest; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.core.TimeValue; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.InnerHitBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.search.SearchHit; -import org.elasticsearch.search.SearchHits; -import org.elasticsearch.search.builder.SearchSourceBuilder; -import org.elasticsearch.search.collapse.CollapseBuilder; -import org.elasticsearch.search.sort.FieldSortBuilder; -import org.elasticsearch.search.sort.SortBuilder; -import org.elasticsearch.search.sort.SortBuilders; -import org.elasticsearch.search.sort.SortOrder; -import org.joda.time.format.ISODateTimeFormat; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Spout which pulls URL from an ES index. Use a single instance unless you use 'es.status.routing' - * with the StatusUpdaterBolt, in which case you need to have exactly the same number of spout - * instances as ES shards. Collapses results to implement politeness and ensure a good diversity of - * sources. - */ -public class CollapsingSpout extends AbstractSpout implements ActionListener { - - private static final Logger LOG = LoggerFactory.getLogger(CollapsingSpout.class); - - /** Used to avoid deep paging * */ - private static final String ESMaxStartOffsetParamName = "es.status.max.start.offset"; - - private int lastStartOffset = 0; - private int maxStartOffset = -1; - - @Override - public void open( - Map stormConf, - TopologyContext context, - SpoutOutputCollector collector) { - maxStartOffset = ConfUtils.getInt(stormConf, ESMaxStartOffsetParamName, -1); - super.open(stormConf, context, collector); - } - - @Override - protected void populateBuffer() { - // not used yet or returned empty results - if (queryDate == null) { - queryDate = new Date(); - lastTimeResetToNOW = Instant.now(); - lastStartOffset = 0; - } - // been running same query for too long and paging deep? 
- else if (maxStartOffset != -1 && lastStartOffset > maxStartOffset) { - LOG.info("Reached max start offset {}", lastStartOffset); - lastStartOffset = 0; - } - - String formattedLastDate = ISODateTimeFormat.dateTimeNoMillis().print(queryDate.getTime()); - - LOG.info("{} Populating buffer with nextFetchDate <= {}", logIdprefix, formattedLastDate); - - BoolQueryBuilder queryBuilder = - boolQuery() - .filter(QueryBuilders.rangeQuery("nextFetchDate").lte(formattedLastDate)); - - if (filterQueries != null) { - for (String filterQuery : filterQueries) { - queryBuilder.filter(QueryBuilders.queryStringQuery(filterQuery)); - } - } - - SearchRequest request = new SearchRequest(indexName); - - SearchSourceBuilder sourceBuilder = new SearchSourceBuilder(); - sourceBuilder.query(queryBuilder); - sourceBuilder.from(lastStartOffset); - sourceBuilder.size(maxBucketNum); - sourceBuilder.explain(false); - sourceBuilder.trackTotalHits(false); - - // https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-preference.html - // _shards:2,3 - // specific shard but ideally a local copy of it - if (shardID != -1) { - request.preference("_shards:" + shardID + "|_local"); - } - - if (queryTimeout != -1) { - sourceBuilder.timeout(new TimeValue(queryTimeout, TimeUnit.SECONDS)); - } - - if (StringUtils.isNotBlank(totalSortField)) { - sourceBuilder.sort(new FieldSortBuilder(totalSortField).order(SortOrder.ASC)); - } - - CollapseBuilder collapse = new CollapseBuilder(partitionField); - - // group expansion -> sends sub queries for each bucket - if (maxURLsPerBucket > 1) { - InnerHitBuilder ihb = new InnerHitBuilder(); - ihb.setSize(maxURLsPerBucket); - ihb.setName("urls_per_bucket"); - List> sorts = new LinkedList<>(); - // sort within a bucket - for (String bsf : bucketSortField) { - FieldSortBuilder bucketsorter = SortBuilders.fieldSort(bsf).order(SortOrder.ASC); - sorts.add(bucketsorter); - } - if (!sorts.isEmpty()) { - ihb.setSorts(sorts); - } - collapse.setInnerHits(ihb); - } - - sourceBuilder.collapse(collapse); - - request.source(sourceBuilder); - - // dump query to log - LOG.debug("{} ES query {}", logIdprefix, request.toString()); - - isInQuery.set(true); - client.searchAsync(request, RequestOptions.DEFAULT, this); - } - - @Override - public void onFailure(Exception e) { - LOG.error("{} Exception with ES query", logIdprefix, e); - markQueryReceivedNow(); - } - - @Override - public void onResponse(SearchResponse response) { - long timeTaken = System.currentTimeMillis() - getTimeLastQuerySent(); - - SearchHit[] hits = response.getHits().getHits(); - int numBuckets = hits.length; - - int alreadyprocessed = 0; - int numDocs = 0; - - for (SearchHit hit : hits) { - Map innerHits = hit.getInnerHits(); - // wanted just one per bucket : no inner hits - if (innerHits == null) { - numDocs++; - if (!addHitToBuffer(hit)) { - alreadyprocessed++; - } - continue; - } - // more than one per bucket - SearchHits inMyBucket = innerHits.get("urls_per_bucket"); - for (SearchHit subHit : inMyBucket.getHits()) { - numDocs++; - if (!addHitToBuffer(subHit)) { - alreadyprocessed++; - } - } - } - - queryTimes.addMeasurement(timeTaken); - // could be derived from the count of query times above - eventCounter.scope("ES_queries").incrBy(1); - eventCounter.scope("ES_docs").incrBy(numDocs); - eventCounter.scope("already_being_processed").incrBy(alreadyprocessed); - - LOG.info( - "{} ES query returned {} hits from {} buckets in {} msec with {} already being processed.Took {} msec per doc on average.", - logIdprefix, - 
numDocs, - numBuckets, - timeTaken, - alreadyprocessed, - ((float) timeTaken / numDocs)); - - // reset the value for next fetch date if the previous one is too old - if (resetFetchDateAfterNSecs != -1) { - Instant changeNeededOn = - Instant.ofEpochMilli( - lastTimeResetToNOW.toEpochMilli() + (resetFetchDateAfterNSecs * 1000)); - if (Instant.now().isAfter(changeNeededOn)) { - LOG.info( - "queryDate reset based on resetFetchDateAfterNSecs {}", - resetFetchDateAfterNSecs); - queryDate = null; - lastStartOffset = 0; - } - } - - // no more results? - if (numBuckets == 0) { - queryDate = null; - lastStartOffset = 0; - } - // still got some results but paging won't help - else if (numBuckets < maxBucketNum) { - lastStartOffset = 0; - } else { - lastStartOffset += numBuckets; - } - - // remove lock - markQueryReceivedNow(); - } -} diff --git a/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/persistence/HybridSpout.java b/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/persistence/HybridSpout.java deleted file mode 100644 index 8e16a5337..000000000 --- a/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/persistence/HybridSpout.java +++ /dev/null @@ -1,221 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to you under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.stormcrawler.elasticsearch.persistence; - -import static org.elasticsearch.index.query.QueryBuilders.boolQuery; - -import com.github.benmanes.caffeine.cache.Cache; -import com.github.benmanes.caffeine.cache.Caffeine; -import java.time.Instant; -import java.util.Date; -import java.util.List; -import java.util.Map; -import org.apache.storm.spout.SpoutOutputCollector; -import org.apache.storm.task.TopologyContext; -import org.apache.stormcrawler.persistence.EmptyQueueListener; -import org.apache.stormcrawler.util.ConfUtils; -import org.elasticsearch.action.ActionListener; -import org.elasticsearch.action.search.SearchRequest; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.index.query.BoolQueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.search.SearchHit; -import org.elasticsearch.search.builder.SearchSourceBuilder; -import org.elasticsearch.search.sort.FieldSortBuilder; -import org.elasticsearch.search.sort.SortBuilders; -import org.elasticsearch.search.sort.SortOrder; -import org.joda.time.format.ISODateTimeFormat; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Uses collapsing spouts to get an initial set of URLs and keys to query for and gets emptyQueue - * notifications from the URLBuffer to query ES for a specific key. 
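A sketch of the extra setting this spout adds, with an arbitrary value: per the code below, `es.status.max.urls.per.reload` controls how many URLs are pulled back for a single key when its queue empties, and falls back to `es.status.max.urls.per.bucket` when unset.

```java
import org.apache.storm.Config;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.stormcrawler.elasticsearch.persistence.HybridSpout;

public class HybridSpoutSketch {
    public static void main(String[] args) {
        Config conf = new Config();
        // per-key top-up size when a queue in the URLBuffer runs empty
        conf.put("es.status.max.urls.per.reload", 20);

        TopologyBuilder builder = new TopologyBuilder();
        builder.setSpout("spout", new HybridSpout(), 1);
    }
}
```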
- * - * @since 1.15 - */ -public class HybridSpout extends AggregationSpout implements EmptyQueueListener { - - private static final Logger LOG = LoggerFactory.getLogger(HybridSpout.class); - - protected static final String RELOADPARAMNAME = "es.status.max.urls.per.reload"; - - private int bufferReloadSize = 10; - - private Cache searchAfterCache; - - private HostResultListener hrl; - - @Override - public void open( - Map stormConf, - TopologyContext context, - SpoutOutputCollector collector) { - super.open(stormConf, context, collector); - bufferReloadSize = ConfUtils.getInt(stormConf, RELOADPARAMNAME, maxURLsPerBucket); - buffer.setEmptyQueueListener(this); - searchAfterCache = Caffeine.newBuilder().build(); - hrl = new HostResultListener(); - } - - @Override - public void emptyQueue(String queueName) { - - LOG.info("{} Emptied buffer queue for {}", logIdprefix, queueName); - - if (!currentBuckets.contains(queueName)) { - // not interested in this one any more - return; - } - - // reloading the aggregs - searching now - // would just overload ES and yield - // mainly duplicates - if (isInQuery.get()) { - LOG.trace("{} isInquery true", logIdprefix, queueName); - return; - } - - LOG.info("{} Querying for more docs for {}", logIdprefix, queueName); - - if (queryDate == null) { - queryDate = new Date(); - lastTimeResetToNOW = Instant.now(); - } - - String formattedQueryDate = ISODateTimeFormat.dateTimeNoMillis().print(queryDate.getTime()); - - BoolQueryBuilder queryBuilder = - boolQuery() - .filter(QueryBuilders.rangeQuery("nextFetchDate").lte(formattedQueryDate)); - - queryBuilder.filter(QueryBuilders.termQuery(partitionField, queueName)); - - SearchSourceBuilder sourceBuilder = new SearchSourceBuilder(); - sourceBuilder.query(queryBuilder); - sourceBuilder.from(0); - sourceBuilder.size(bufferReloadSize); - sourceBuilder.explain(false); - sourceBuilder.trackTotalHits(false); - - // sort within a bucket - for (String bsf : bucketSortField) { - FieldSortBuilder sorter = SortBuilders.fieldSort(bsf).order(SortOrder.ASC); - sourceBuilder.sort(sorter); - } - - // do we have a search after for this one? 
- Object[] searchAfterValues = searchAfterCache.getIfPresent(queueName); - if (searchAfterValues != null) { - sourceBuilder.searchAfter(searchAfterValues); - } - - SearchRequest request = new SearchRequest(indexName); - - request.source(sourceBuilder); - - // https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-preference.html - // _shards:2,3 - // specific shard but ideally a local copy of it - if (shardID != -1) { - request.preference("_shards:" + shardID + "|_local"); - } - - // dump query to log - LOG.debug("{} ES query {} - {}", logIdprefix, queueName, request.toString()); - - client.searchAsync(request, RequestOptions.DEFAULT, hrl); - } - - @Override - /** Overrides the handling of responses for aggregations */ - public void onResponse(SearchResponse response) { - // delete all entries from the searchAfterCache when - // we get the results from the aggregation spouts - searchAfterCache.invalidateAll(); - super.onResponse(response); - } - - @Override - /** The aggregation kindly told us where to start from * */ - protected void sortValuesForKey(String key, Object[] sortValues) { - if (sortValues != null && sortValues.length > 0) this.searchAfterCache.put(key, sortValues); - } - - /** Handling of results for a specific queue * */ - class HostResultListener implements ActionListener { - - @Override - public void onResponse(SearchResponse response) { - - int alreadyprocessed = 0; - int numDocs = 0; - - SearchHit[] hits = response.getHits().getHits(); - - Object[] sortValues = null; - - // retrieve the key for these results - String key = null; - - for (SearchHit hit : hits) { - numDocs++; - String pfield = partitionField; - Map sourceAsMap = hit.getSourceAsMap(); - if (pfield.startsWith("metadata.")) { - sourceAsMap = (Map) sourceAsMap.get("metadata"); - pfield = pfield.substring(9); - } - Object key_as_object = sourceAsMap.get(pfield); - if (key_as_object instanceof List) { - if (((List) (key_as_object)).size() == 1) - key = (String) ((List) key_as_object).get(0); - } else { - key = key_as_object.toString(); - } - - sortValues = hit.getSortValues(); - if (!addHitToBuffer(hit)) { - alreadyprocessed++; - } - } - - // no key if no results have been found - if (key != null) { - searchAfterCache.put(key, sortValues); - } - - eventCounter.scope("ES_queries_host").incrBy(1); - eventCounter.scope("ES_docs_host").incrBy(numDocs); - eventCounter.scope("already_being_processed_host").incrBy(alreadyprocessed); - - LOG.info( - "{} ES term query returned {} hits in {} msec with {} already being processed for {}", - logIdprefix, - numDocs, - response.getTook().getMillis(), - alreadyprocessed, - key); - } - - @Override - public void onFailure(Exception e) { - LOG.error("Exception with ES query", e); - } - } -} diff --git a/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/persistence/ScrollSpout.java b/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/persistence/ScrollSpout.java deleted file mode 100644 index 1f44a01f7..000000000 --- a/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/persistence/ScrollSpout.java +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to you under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.stormcrawler.elasticsearch.persistence; - -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Queue; -import org.apache.storm.topology.OutputFieldsDeclarer; -import org.apache.storm.tuple.Fields; -import org.apache.storm.tuple.Values; -import org.apache.storm.utils.Utils; -import org.apache.stormcrawler.Constants; -import org.apache.stormcrawler.Metadata; -import org.apache.stormcrawler.persistence.AbstractStatusUpdaterBolt; -import org.apache.stormcrawler.persistence.Status; -import org.elasticsearch.action.ActionListener; -import org.elasticsearch.action.search.SearchRequest; -import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.action.search.SearchScrollRequest; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.core.TimeValue; -import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.search.SearchHit; -import org.elasticsearch.search.SearchHits; -import org.elasticsearch.search.builder.SearchSourceBuilder; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Reads all the documents from a shard and emits them on the status stream. Used for copying an - * index. 
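A sketch of the index-copy topology this spout is intended for; the component ids and the `status2` configuration suffix are assumptions, chosen only to show that the source index read by the spout and the destination written by the bolt would be configured under different `es.<suffix>.*` keys.

```java
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.tuple.Fields;
import org.apache.stormcrawler.Constants;
import org.apache.stormcrawler.elasticsearch.persistence.ScrollSpout;
import org.apache.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt;

public class CopyStatusIndexSketch {
    public static void main(String[] args) {
        TopologyBuilder builder = new TopologyBuilder();
        // reads every document of the source index and emits it on the status stream
        builder.setSpout("scroll", new ScrollSpout(), 1);
        // writes to the index configured under the "status2" suffix
        builder.setBolt("updater", new StatusUpdaterBolt("status2"))
                .fieldsGrouping("scroll", Constants.StatusStreamName, new Fields("url"));
    }
}
```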
- */ -public class ScrollSpout extends AbstractSpout implements ActionListener { - - private String scrollId = null; - private boolean hasFinished = false; - - private Queue queue = new LinkedList<>(); - - private static final Logger LOG = LoggerFactory.getLogger(ScrollSpout.class); - - @Override - // simplified version of the super method so that we can store the fields in - // the - // map of things being processed - public void nextTuple() { - synchronized (queue) { - if (!queue.isEmpty()) { - List fields = queue.remove(); - String url = fields.get(0).toString(); - _collector.emit(Constants.StatusStreamName, fields, url); - beingProcessed.put(url, fields); - eventCounter.scope("emitted").incrBy(1); - LOG.debug("{} emitted {}", logIdprefix, url); - return; - } - } - - if (isInQuery.get()) { - LOG.trace("{} isInquery true", logIdprefix); - // sleep for a bit but not too much in order to give ack/fail a - // chance - Utils.sleep(10); - return; - } - - // re-populate the buffer - populateBuffer(); - } - - @Override - protected void populateBuffer() { - if (hasFinished) { - Utils.sleep(10); - return; - } - - // initial request - if (scrollId == null) { - SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder(); - searchSourceBuilder.query(QueryBuilders.matchAllQuery()); - searchSourceBuilder.size(maxURLsPerBucket * maxBucketNum); - SearchRequest searchRequest = new SearchRequest(indexName); - searchRequest.source(searchSourceBuilder); - searchRequest.scroll(TimeValue.timeValueMinutes(5L)); - - // specific shard but ideally a local copy of it - if (shardID != -1) { - searchRequest.preference("_shards:" + shardID + "|_local"); - } - - isInQuery.set(true); - LOG.trace("{} isInquery set to true", logIdprefix); - - client.searchAsync(searchRequest, RequestOptions.DEFAULT, this); - - // dump query to log - LOG.debug("{} ES query {}", logIdprefix, searchRequest.toString()); - return; - } - - SearchScrollRequest scrollRequest = new SearchScrollRequest(scrollId); - scrollRequest.scroll(TimeValue.timeValueMinutes(5L)); - - isInQuery.set(true); - client.scrollAsync(scrollRequest, RequestOptions.DEFAULT, this); - // dump query to log - LOG.debug("{} ES query {}", logIdprefix, scrollRequest.toString()); - } - - @Override - public void onResponse(SearchResponse response) { - SearchHits hits = response.getHits(); - LOG.info( - "{} ES query returned {} hits in {} msec", - logIdprefix, - hits.getHits().length, - response.getTook().getMillis()); - hasFinished = hits.getHits().length == 0; - synchronized (this.queue) { - // Unlike standard spouts, the scroll queries should never return - // the same - // document twice -> no need to look in the buffer or cache - for (SearchHit hit : hits) { - Map keyValues = hit.getSourceAsMap(); - String url = (String) keyValues.get("url"); - String status = (String) keyValues.get("status"); - String nextFetchDate = (String) keyValues.get("nextFetchDate"); - Metadata metadata = fromKeyValues(keyValues); - metadata.setValue( - AbstractStatusUpdaterBolt.AS_IS_NEXTFETCHDATE_METADATA, nextFetchDate); - this.queue.add(new Values(url, metadata, Status.valueOf(status))); - } - } - scrollId = response.getScrollId(); - // remove lock - markQueryReceivedNow(); - } - - @Override - public void onFailure(Exception e) { - LOG.error("{} Exception with ES query", logIdprefix, e); - markQueryReceivedNow(); - } - - @Override - public void fail(Object msgId) { - LOG.info("{} Fail for {}", logIdprefix, msgId); - eventCounter.scope("failed").incrBy(1); - // retrieve the values from 
being processed and send them back to the - // queue - Values v = (Values) beingProcessed.remove(msgId); - queue.add(v); - } - - @Override - public void declareOutputFields(OutputFieldsDeclarer declarer) { - declarer.declareStream(Constants.StatusStreamName, new Fields("url", "metadata", "status")); - } -} diff --git a/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/persistence/StatusUpdaterBolt.java b/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/persistence/StatusUpdaterBolt.java deleted file mode 100644 index 3e10348c5..000000000 --- a/external/elasticsearch/src/main/java/org/apache/stormcrawler/elasticsearch/persistence/StatusUpdaterBolt.java +++ /dev/null @@ -1,451 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to you under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.stormcrawler.elasticsearch.persistence; - -import com.github.benmanes.caffeine.cache.Cache; -import com.github.benmanes.caffeine.cache.Caffeine; -import com.github.benmanes.caffeine.cache.RemovalCause; -import com.github.benmanes.caffeine.cache.RemovalListener; -import java.util.*; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.locks.ReentrantLock; -import java.util.stream.Collectors; -import org.apache.commons.lang.StringUtils; -import org.apache.storm.metric.api.MultiCountMetric; -import org.apache.storm.metric.api.MultiReducedMetric; -import org.apache.storm.task.OutputCollector; -import org.apache.storm.task.TopologyContext; -import org.apache.storm.tuple.Tuple; -import org.apache.stormcrawler.Metadata; -import org.apache.stormcrawler.elasticsearch.BulkItemResponseToFailedFlag; -import org.apache.stormcrawler.elasticsearch.ElasticSearchConnection; -import org.apache.stormcrawler.persistence.AbstractStatusUpdaterBolt; -import org.apache.stormcrawler.persistence.Status; -import org.apache.stormcrawler.util.ConfUtils; -import org.apache.stormcrawler.util.PerSecondReducer; -import org.apache.stormcrawler.util.URLPartitioner; -import org.elasticsearch.action.DocWriteRequest; -import org.elasticsearch.action.bulk.BulkItemResponse; -import org.elasticsearch.action.bulk.BulkProcessor; -import org.elasticsearch.action.bulk.BulkRequest; -import org.elasticsearch.action.bulk.BulkResponse; -import org.elasticsearch.action.index.IndexRequest; -import org.elasticsearch.rest.RestStatus; -import org.elasticsearch.xcontent.XContentBuilder; -import org.elasticsearch.xcontent.XContentFactory; -import org.jetbrains.annotations.NotNull; -import org.jetbrains.annotations.Nullable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Simple bolt which stores the status of URLs into ElasticSearch. Takes the tuples coming from the - * 'status' stream. To be used in combination with a Spout to read from the index. 
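A configuration sketch for the routing options handled in `prepare()` below; the values, in particular `metadata.hostname`, are illustrative only.

```java
import org.apache.storm.Config;

public class StatusRoutingConfigSketch {
    public static void main(String[] args) {
        Config conf = new Config();
        conf.put("es.status.index.name", "status");
        // reuse the politeness partitioning (e.g. byHost) as the ES routing key
        conf.put("es.status.routing", true);
        // optionally store that key as an explicit field, here inside the
        // metadata object, so the spouts can aggregate or collapse on it
        conf.put("es.status.routing.fieldname", "metadata.hostname");
    }
}
```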
- */ -public class StatusUpdaterBolt extends AbstractStatusUpdaterBolt - implements RemovalListener>, BulkProcessor.Listener { - - private static final Logger LOG = LoggerFactory.getLogger(StatusUpdaterBolt.class); - - private String ESBoltType = "status"; - - private static final String ESStatusIndexNameParamName = "es.%s.index.name"; - private static final String ESStatusRoutingParamName = "es.%s.routing"; - private static final String ESStatusRoutingFieldParamName = "es.%s.routing.fieldname"; - - private boolean routingFieldNameInMetadata = false; - - private String indexName; - - private URLPartitioner partitioner; - - /** whether to apply the same partitioning logic used for politeness for routing, e.g byHost */ - private boolean doRouting; - - /** Store the key used for routing explicitly as a field in metadata * */ - private String fieldNameForRoutingKey = null; - - private ElasticSearchConnection connection; - - private Cache> waitAck; - - // Be fair due to cache timeout - private final ReentrantLock waitAckLock = new ReentrantLock(true); - - private MultiCountMetric eventCounter; - - private MultiReducedMetric receivedPerSecMetrics; - - public StatusUpdaterBolt() { - super(); - } - - /** - * Loads the configuration using a substring different from the default value 'status' in order - * to distinguish it from the spout configurations - */ - public StatusUpdaterBolt(String boltType) { - super(); - ESBoltType = boltType; - } - - @Override - public void prepare( - Map stormConf, TopologyContext context, OutputCollector collector) { - - super.prepare(stormConf, context, collector); - - indexName = - ConfUtils.getString( - stormConf, - String.format(StatusUpdaterBolt.ESStatusIndexNameParamName, ESBoltType), - "status"); - - doRouting = - ConfUtils.getBoolean( - stormConf, - String.format(StatusUpdaterBolt.ESStatusRoutingParamName, ESBoltType), - false); - - partitioner = new URLPartitioner(); - partitioner.configure(stormConf); - - fieldNameForRoutingKey = - ConfUtils.getString( - stormConf, - String.format(StatusUpdaterBolt.ESStatusRoutingFieldParamName, ESBoltType)); - if (StringUtils.isNotBlank(fieldNameForRoutingKey)) { - if (fieldNameForRoutingKey.startsWith("metadata.")) { - routingFieldNameInMetadata = true; - fieldNameForRoutingKey = fieldNameForRoutingKey.substring("metadata.".length()); - } - // periods are not allowed in ES2 - replace with %2E - fieldNameForRoutingKey = fieldNameForRoutingKey.replaceAll("\\.", "%2E"); - } - - waitAck = - Caffeine.newBuilder() - .expireAfterWrite(60, TimeUnit.SECONDS) - .removalListener(this) - .build(); - - int metrics_time_bucket_secs = 30; - - // create gauge for waitAck - context.registerMetric("waitAck", () -> waitAck.estimatedSize(), metrics_time_bucket_secs); - - // benchmarking - average number of items received back by Elastic per second - this.receivedPerSecMetrics = - context.registerMetric( - "average_persec", - new MultiReducedMetric(new PerSecondReducer()), - metrics_time_bucket_secs); - - this.eventCounter = - context.registerMetric( - "counters", new MultiCountMetric(), metrics_time_bucket_secs); - - try { - connection = ElasticSearchConnection.getConnection(stormConf, ESBoltType, this); - } catch (Exception e1) { - LOG.error("Can't connect to ElasticSearch", e1); - throw new RuntimeException(e1); - } - } - - @Override - public void cleanup() { - if (connection == null) { - return; - } - connection.close(); - connection = null; - } - - @Override - public void store( - String url, Status status, Metadata metadata, Optional 
nextFetch, Tuple tuple) - throws Exception { - - String documentID = getDocumentID(metadata, url); - - boolean isAlreadySentAndDiscovered; - // need to synchronize: otherwise it might get added to the cache - // without having been sent to ES - waitAckLock.lock(); - try { - // check that the same URL is not being sent to ES - final var alreadySent = waitAck.getIfPresent(documentID); - isAlreadySentAndDiscovered = status.equals(Status.DISCOVERED) && alreadySent != null; - } finally { - waitAckLock.unlock(); - } - - if (isAlreadySentAndDiscovered) { - // if this object is discovered - adding another version of it - // won't make any difference - LOG.debug( - "Already being sent to ES {} with status {} and ID {}", - url, - status, - documentID); - // ack straight away! - eventCounter.scope("skipped").incrBy(1); - super.ack(tuple, url); - return; - } - - XContentBuilder builder = XContentFactory.jsonBuilder().startObject(); - builder.field("url", url); - builder.field("status", status); - - builder.startObject("metadata"); - for (String mdKey : metadata.keySet()) { - String[] values = metadata.getValues(mdKey); - // periods are not allowed in ES2 - replace with %2E - mdKey = mdKey.replaceAll("\\.", "%2E"); - builder.array(mdKey, values); - } - - String partitionKey = partitioner.getPartition(url, metadata); - if (partitionKey == null) { - partitionKey = "_DEFAULT_"; - } - - // store routing key in metadata? - if (StringUtils.isNotBlank(fieldNameForRoutingKey) && routingFieldNameInMetadata) { - builder.field(fieldNameForRoutingKey, partitionKey); - } - - builder.endObject(); - - // store routing key outside metadata? - if (StringUtils.isNotBlank(fieldNameForRoutingKey) && !routingFieldNameInMetadata) { - builder.field(fieldNameForRoutingKey, partitionKey); - } - - if (nextFetch.isPresent()) { - builder.timeField("nextFetchDate", nextFetch.get()); - } - - builder.endObject(); - - IndexRequest request = new IndexRequest(getIndexName(metadata)); - - // check that we don't overwrite an existing entry - // When create is used, the index operation will fail if a document - // by that id already exists in the index. 
-        final boolean create = status.equals(Status.DISCOVERED);
-        request.source(builder).id(documentID).create(create);
-
-        if (doRouting) {
-            request.routing(partitionKey);
-        }
-
-        waitAckLock.lock();
-        try {
-            final List<Tuple> tt = waitAck.get(documentID, k -> new LinkedList<>());
-            tt.add(tuple);
-            LOG.debug("Added to waitAck {} with ID {} total {}", url, documentID, tt.size());
-        } finally {
-            waitAckLock.unlock();
-        }
-
-        LOG.debug("Sending to ES buffer {} with ID {}", url, documentID);
-
-        connection.addToProcessor(request);
-    }
-
-    @Override
-    public void onRemoval(
-            @Nullable String key, @Nullable List<Tuple> value, @NotNull RemovalCause cause) {
-        if (!cause.wasEvicted()) return;
-        LOG.error("Purged from waitAck {} with {} values", key, value.size());
-        for (Tuple t : value) {
-            eventCounter.scope("purged").incrBy(1);
-            _collector.fail(t);
-        }
-    }
-
-    @Override
-    public void afterBulk(long executionId, BulkRequest request, BulkResponse response) {
-        LOG.debug("afterBulk [{}] with {} responses", executionId, request.numberOfActions());
-        eventCounter.scope("bulks_received").incrBy(1);
-        eventCounter.scope("bulk_msec").incrBy(response.getTook().getMillis());
-        eventCounter.scope("received").incrBy(request.numberOfActions());
-        receivedPerSecMetrics.scope("received").update(request.numberOfActions());
-
-        var idsToBulkItemsWithFailedFlag =
-                Arrays.stream(response.getItems())
-                        .map(
-                                bir -> {
-                                    String id = bir.getId();
-                                    BulkItemResponse.Failure f = bir.getFailure();
-                                    boolean failed = false;
-                                    if (f != null) {
-                                        // already discovered
-                                        if (f.getStatus().equals(RestStatus.CONFLICT)) {
-                                            eventCounter.scope("doc_conflicts").incrBy(1);
-                                            LOG.debug("Doc conflict ID {}", id);
-                                        } else {
-                                            LOG.error("Update ID {}, failure: {}", id, f);
-                                            failed = true;
-                                        }
-                                    }
-                                    return new BulkItemResponseToFailedFlag(bir, failed);
-                                })
-                        .collect(
-                                // https://github.com/DigitalPebble/storm-crawler/issues/832
-                                Collectors.groupingBy(
-                                        idWithFailedFlagTuple -> idWithFailedFlagTuple.id,
-                                        Collectors.toUnmodifiableList()));
-
-        Map<String, List<Tuple>> presentTuples;
-        long estimatedSize;
-        Set<String> debugInfo = null;
-        waitAckLock.lock();
-        try {
-            presentTuples = waitAck.getAllPresent(idsToBulkItemsWithFailedFlag.keySet());
-            if (!presentTuples.isEmpty()) {
-                waitAck.invalidateAll(presentTuples.keySet());
-            }
-            estimatedSize = waitAck.estimatedSize();
-            // Only if we have to.
-            if (LOG.isDebugEnabled() && estimatedSize > 0L) {
-                debugInfo = new HashSet<>(waitAck.asMap().keySet());
-            }
-        } finally {
-            waitAckLock.unlock();
-        }
-
-        int ackCount = 0;
-        int failureCount = 0;
-
-        for (var entry : presentTuples.entrySet()) {
-            final var id = entry.getKey();
-            final var associatedTuple = entry.getValue();
-            final var bulkItemsWithFailedFlag = idsToBulkItemsWithFailedFlag.get(id);
-
-            BulkItemResponseToFailedFlag selected;
-            if (bulkItemsWithFailedFlag.size() == 1) {
-                selected = bulkItemsWithFailedFlag.get(0);
-            } else {
-                // Fallback if there are multiple responses for the same id
-                BulkItemResponseToFailedFlag tmp = null;
-                var ctFailed = 0;
-                for (var buwff : bulkItemsWithFailedFlag) {
-                    if (tmp == null) {
-                        tmp = buwff;
-                    }
-                    if (buwff.failed) ctFailed++;
-                    else tmp = buwff;
-                }
-                if (ctFailed != bulkItemsWithFailedFlag.size()) {
-                    LOG.warn(
-                            "The id {} would result in an ack and a failure.
Using only the ack for processing.",
-                            id);
-                }
-                selected = Objects.requireNonNull(tmp);
-            }
-
-            if (associatedTuple != null) {
-                LOG.debug("Acked {} tuple(s) for ID {}", associatedTuple.size(), id);
-                for (Tuple tuple : associatedTuple) {
-                    if (!selected.failed) {
-                        String url = tuple.getStringByField("url");
-                        ackCount++;
-                        // ack and put in cache
-                        LOG.debug("Acked {} with ID {}", url, id);
-                        eventCounter.scope("acked").incrBy(1);
-                        super.ack(tuple, url);
-                    } else {
-                        failureCount++;
-                        eventCounter.scope("failed").incrBy(1);
-                        _collector.fail(tuple);
-                    }
-                }
-            } else {
-                LOG.warn("Could not find unacked tuple for {}", id);
-            }
-        }
-
-        LOG.info(
-                "Bulk response [{}] : items {}, waitAck {}, acked {}, failed {}",
-                executionId,
-                idsToBulkItemsWithFailedFlag.size(),
-                estimatedSize,
-                ackCount,
-                failureCount);
-        if (debugInfo != null) {
-            for (String kinaw : debugInfo) {
-                LOG.debug("Still in wait ack after bulk response [{}] => {}", executionId, kinaw);
-            }
-        }
-    }
-
-    @Override
-    public void afterBulk(long executionId, BulkRequest request, Throwable throwable) {
-        eventCounter.scope("bulks_received").incrBy(1);
-        eventCounter.scope("received").incrBy(request.numberOfActions());
-        receivedPerSecMetrics.scope("received").update(request.numberOfActions());
-
-        LOG.error("Exception with bulk {} - failing the whole lot ", executionId, throwable);
-
-        final var failedIds =
-                request.requests().stream()
-                        .map(DocWriteRequest::id)
-                        .collect(Collectors.toUnmodifiableSet());
-        waitAckLock.lock();
-        Map<String, List<Tuple>> failedTupleLists;
-        try {
-            failedTupleLists = waitAck.getAllPresent(failedIds);
-            if (!failedTupleLists.isEmpty()) {
-                waitAck.invalidateAll(failedTupleLists.keySet());
-            }
-        } finally {
-            waitAckLock.unlock();
-        }
-
-        for (var id : failedIds) {
-            var failedTuples = failedTupleLists.get(id);
-            if (failedTuples != null) {
-                LOG.debug("Failed {} tuple(s) for ID {}", failedTuples.size(), id);
-                for (Tuple x : failedTuples) {
-                    // fail it
-                    eventCounter.scope("failed").incrBy(1);
-                    _collector.fail(x);
-                }
-            } else {
-                LOG.warn("Could not find unacked tuple for {}", id);
-            }
-        }
-    }
-
-    @Override
-    public void beforeBulk(long executionId, BulkRequest request) {
-        LOG.debug("beforeBulk {} with {} actions", executionId, request.numberOfActions());
-        eventCounter.scope("bulks_sent").incrBy(1);
-    }
-
-    /**
-     * Must be overridden for implementing custom index names based on some metadata information By
-     * Default, indexName coming from config is used
-     */
-    protected String getIndexName(Metadata m) {
-        return indexName;
-    }
-}
diff --git a/external/elasticsearch/src/test/java/org/apache/stormcrawler/elasticsearch/bolt/IndexerBoltTest.java b/external/elasticsearch/src/test/java/org/apache/stormcrawler/elasticsearch/bolt/IndexerBoltTest.java
deleted file mode 100644
index fb6278541..000000000
--- a/external/elasticsearch/src/test/java/org/apache/stormcrawler/elasticsearch/bolt/IndexerBoltTest.java
+++ /dev/null
@@ -1,151 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to you under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.stormcrawler.elasticsearch.bolt; - -import static org.junit.Assert.assertEquals; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - -import java.util.HashMap; -import java.util.Map; -import java.util.concurrent.*; -import org.apache.storm.task.OutputCollector; -import org.apache.storm.tuple.Tuple; -import org.apache.stormcrawler.Constants; -import org.apache.stormcrawler.Metadata; -import org.apache.stormcrawler.TestOutputCollector; -import org.apache.stormcrawler.TestUtil; -import org.apache.stormcrawler.indexing.AbstractIndexerBolt; -import org.junit.*; -import org.junit.rules.Timeout; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.testcontainers.elasticsearch.ElasticsearchContainer; - -public class IndexerBoltTest { - - @Rule public Timeout globalTimeout = Timeout.seconds(120); - - private ElasticsearchContainer container; - private IndexerBolt bolt; - protected TestOutputCollector output; - - private static final Logger LOG = LoggerFactory.getLogger(IndexerBoltTest.class); - private static ExecutorService executorService; - - @BeforeClass - public static void beforeClass() { - executorService = Executors.newFixedThreadPool(2); - } - - @AfterClass - public static void afterClass() { - executorService.shutdown(); - executorService = null; - } - - @Before - public void setupIndexerBolt() { - - String version = System.getProperty("elasticsearch-version"); - if (version == null) version = "7.17.7"; - LOG.info("Starting docker instance of Elasticsearch {}...", version); - - container = - new ElasticsearchContainer( - "docker.elastic.co/elasticsearch/elasticsearch:" + version); - container.withPassword("s3cret"); - container.start(); - - bolt = new IndexerBolt("content"); - - // give the indexer the port for connecting to ES - - Map conf = new HashMap<>(); - conf.put(AbstractIndexerBolt.urlFieldParamName, "url"); - conf.put(AbstractIndexerBolt.canonicalMetadataParamName, "canonical"); - conf.put("es.indexer.addresses", container.getHttpHostAddress()); - conf.put("es.indexer.compatibility.mode", false); - conf.put("es.indexer.user", "elastic"); - conf.put("es.indexer.password", "s3cret"); - - output = new TestOutputCollector(); - - bolt.prepare(conf, TestUtil.getMockedTopologyContext(), new OutputCollector(output)); - } - - @After - public void close() { - LOG.info("Closing indexer bolt and ES container"); - bolt.cleanup(); - container.close(); - output = null; - } - - private void index(String url, String text, Metadata metadata) { - Tuple tuple = mock(Tuple.class); - when(tuple.getStringByField("text")).thenReturn(text); - when(tuple.getStringByField("url")).thenReturn(url); - when(tuple.getValueByField("metadata")).thenReturn(metadata); - bolt.execute(tuple); - } - - private int lastIndex(String url, String text, Metadata metadata, long timeoutInMs) - throws ExecutionException, InterruptedException, TimeoutException { - var oldSize = output.getEmitted(Constants.StatusStreamName).size(); - index(url, text, metadata); - return executorService - .submit( - () -> { - // check that something has been 
emitted out - var outputSize = output.getEmitted(Constants.StatusStreamName).size(); - while (outputSize == oldSize) { - Thread.sleep(100); - outputSize = output.getEmitted(Constants.StatusStreamName).size(); - } - return outputSize; - }) - .get(timeoutInMs, TimeUnit.MILLISECONDS); - } - - @Test - // https://github.com/DigitalPebble/storm-crawler/issues/832 - public void simultaneousCanonicals() - throws ExecutionException, InterruptedException, TimeoutException { - Metadata m1 = new Metadata(); - String url = - "https://www.obozrevatel.com/ukr/dnipro/city/u-dnipri-ta-oblasti-ogolosili-shtormove-poperedzhennya.htm"; - m1.addValue("canonical", url); - - Metadata m2 = new Metadata(); - String url2 = - "https://www.obozrevatel.com/ukr/dnipro/city/u-dnipri-ta-oblasti-ogolosili-shtormove-poperedzhennya/amp.htm"; - m2.addValue("canonical", url); - - index(url, "", m1); - lastIndex(url2, "", m2, 10_000); - - // should be two in status output - assertEquals(2, output.getEmitted(Constants.StatusStreamName).size()); - - // and 2 acked - assertEquals(2, output.getAckedTuples().size()); - - // TODO check output in ES? - - } -} diff --git a/external/elasticsearch/src/test/java/org/apache/stormcrawler/elasticsearch/bolt/StatusBoltTest.java b/external/elasticsearch/src/test/java/org/apache/stormcrawler/elasticsearch/bolt/StatusBoltTest.java deleted file mode 100644 index a89e79710..000000000 --- a/external/elasticsearch/src/test/java/org/apache/stormcrawler/elasticsearch/bolt/StatusBoltTest.java +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to you under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.stormcrawler.elasticsearch.bolt; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - -import java.io.IOException; -import java.net.URI; -import java.net.URISyntaxException; -import java.nio.charset.Charset; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.HashMap; -import java.util.Map; -import java.util.Objects; -import java.util.concurrent.*; -import org.apache.http.HttpHost; -import org.apache.http.auth.AuthScope; -import org.apache.http.auth.UsernamePasswordCredentials; -import org.apache.http.client.CredentialsProvider; -import org.apache.http.impl.client.BasicCredentialsProvider; -import org.apache.storm.task.OutputCollector; -import org.apache.storm.tuple.Tuple; -import org.apache.stormcrawler.Metadata; -import org.apache.stormcrawler.TestOutputCollector; -import org.apache.stormcrawler.TestUtil; -import org.apache.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt; -import org.apache.stormcrawler.persistence.Status; -import org.elasticsearch.action.get.GetRequest; -import org.elasticsearch.action.get.GetResponse; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestClient; -import org.elasticsearch.client.RestClientBuilder; -import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.client.indices.CreateIndexRequest; -import org.elasticsearch.xcontent.XContentType; -import org.junit.*; -import org.junit.rules.Timeout; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.testcontainers.elasticsearch.ElasticsearchContainer; - -public class StatusBoltTest { - - private ElasticsearchContainer container; - private StatusUpdaterBolt bolt; - protected TestOutputCollector output; - - protected RestHighLevelClient client; - - private static final Logger LOG = LoggerFactory.getLogger(StatusBoltTest.class); - private static ExecutorService executorService; - - @Rule public Timeout globalTimeout = Timeout.seconds(120); - - @BeforeClass - public static void beforeClass() { - executorService = Executors.newFixedThreadPool(2); - } - - @AfterClass - public static void afterClass() { - executorService.shutdown(); - executorService = null; - } - - @Before - public void setupStatusBolt() throws IOException { - - String version = System.getProperty("elasticsearch-version"); - if (version == null) version = "7.17.7"; - LOG.info("Starting docker instance of Elasticsearch {}...", version); - - container = - new ElasticsearchContainer( - "docker.elastic.co/elasticsearch/elasticsearch:" + version) - .withPassword("s3cret"); - - container.start(); - - bolt = new StatusUpdaterBolt(); - - // configure the status index - - RestClientBuilder builder = - RestClient.builder( - new HttpHost(container.getHost(), container.getMappedPort(9200))); - - final CredentialsProvider credentialsProvider = new BasicCredentialsProvider(); - credentialsProvider.setCredentials( - AuthScope.ANY, new UsernamePasswordCredentials("elastic", "s3cret")); - - builder.setHttpClientConfigCallback( - clientBuilder -> { - clientBuilder.setDefaultCredentialsProvider(credentialsProvider); - return clientBuilder; - }); - - client = new RestHighLevelClient(builder); - - // TODO - // https://www.elastic.co/guide/en/elasticsearch/client/java-rest/7.13/java-rest-high-put-mapping.html - - CreateIndexRequest request = new CreateIndexRequest("status"); - - URI uriToFile; - try { - uriToFile = - 
Objects.requireNonNull( - getClass().getClassLoader().getResource("status.mapping")) - .toURI(); - } catch (URISyntaxException e) { - throw new RuntimeException(e); - } - - String mappingSource = Files.readString(Path.of(uriToFile), Charset.defaultCharset()); - - request.source(mappingSource, XContentType.JSON); - - client.indices().create(request, RequestOptions.DEFAULT); - - // configure the status updater bolt - - Map conf = new HashMap<>(); - conf.put("es.status.routing.fieldname", "metadata.key"); - - conf.put("es.status.addresses", container.getHttpHostAddress()); - - conf.put("scheduler.class", "org.apache.stormcrawler.persistence.DefaultScheduler"); - - conf.put("status.updater.cache.spec", "maximumSize=10000,expireAfterAccess=1h"); - - conf.put("metadata.persist", "someKey"); - - conf.put("es.status.compatibility.mode", false); - - conf.put("es.status.user", "elastic"); - conf.put("es.status.password", "s3cret"); - - output = new TestOutputCollector(); - - bolt.prepare(conf, TestUtil.getMockedTopologyContext(), new OutputCollector(output)); - } - - @After - public void close() { - LOG.info("Closing updater bolt and ES container"); - bolt.cleanup(); - container.close(); - output = null; - try { - client.close(); - } catch (IOException e) { - } - } - - private Future store(String url, Status status, Metadata metadata) { - Tuple tuple = mock(Tuple.class); - when(tuple.getValueByField("status")).thenReturn(status); - when(tuple.getStringByField("url")).thenReturn(url); - when(tuple.getValueByField("metadata")).thenReturn(metadata); - bolt.execute(tuple); - - return executorService.submit( - () -> { - var outputSize = output.getAckedTuples().size(); - while (outputSize == 0) { - Thread.sleep(100); - outputSize = output.getAckedTuples().size(); - } - return outputSize; - }); - } - - @Test - // see https://github.com/DigitalPebble/storm-crawler/issues/885 - public void checkListKeyFromES() - throws IOException, ExecutionException, InterruptedException, TimeoutException { - - String url = "https://www.url.net/something"; - - Metadata md = new Metadata(); - - md.addValue("someKey", "someValue"); - - store(url, Status.DISCOVERED, md).get(10, TimeUnit.SECONDS); - - assertEquals(1, output.getAckedTuples().size()); - - // check output in ES? 
- - String id = org.apache.commons.codec.digest.DigestUtils.sha256Hex(url); - - GetResponse result = client.get(new GetRequest("status", id), RequestOptions.DEFAULT); - - Map sourceAsMap = result.getSourceAsMap(); - - final String pfield = "metadata.someKey"; - sourceAsMap = (Map) sourceAsMap.get("metadata"); - - final var pfieldNew = pfield.substring(9); - Object key = sourceAsMap.get(pfieldNew); - - assertTrue(key instanceof java.util.ArrayList); - } -} diff --git a/external/elasticsearch/src/test/resources/status.mapping b/external/elasticsearch/src/test/resources/status.mapping deleted file mode 100644 index e5b14fe97..000000000 --- a/external/elasticsearch/src/test/resources/status.mapping +++ /dev/null @@ -1,39 +0,0 @@ -{ - "settings": { - "index": { - "number_of_shards": 10, - "number_of_replicas": 1, - "refresh_interval": "5s" - } - }, - "mappings": { - "dynamic_templates": [{ - "metadata": { - "path_match": "metadata.*", - "match_mapping_type": "string", - "mapping": { - "type": "keyword" - } - } - }], - "_source": { - "enabled": true - }, - "properties": { - "key": { - "type": "keyword", - "index": true - }, - "nextFetchDate": { - "type": "date", - "format": "date_optional_time" - }, - "status": { - "type": "keyword" - }, - "url": { - "type": "keyword" - } - } - } -} diff --git a/pom.xml b/pom.xml index 28d93538e..7760a709c 100644 --- a/pom.xml +++ b/pom.xml @@ -479,7 +479,6 @@ under the License. core external external/aws - external/elasticsearch external/langid external/opensearch external/solr @@ -488,7 +487,6 @@ under the License. external/urlfrontier external/warc archetype - external/elasticsearch/archetype external/opensearch/archetype