diff options
| author | andrewdkim <adkim414@gmail.com> | 2019-08-06 12:30:09 -0400 | 
|---|---|---|
| committer | andrewdkim <adkim414@gmail.com> | 2019-08-06 12:30:09 -0400 | 
| commit | b6990a61befdea70abd99f125a2488ce5a6f04a6 (patch) | |
| tree | 833c13a0ddabb325cc2e39dbb199f111cced22d2 /solr-8.1.1/example/example-DIH/solr/tika | |
| parent | 2c86a6958186c020ce7fbe99555f07ffe9f9f821 (diff) | |
| parent | 298d1c9b29d6ce2171fd9ac8274b64583b73f6f5 (diff) | |
merge from master
Diffstat (limited to 'solr-8.1.1/example/example-DIH/solr/tika')
4 files changed, 142 insertions, 0 deletions
| diff --git a/solr-8.1.1/example/example-DIH/solr/tika/conf/managed-schema b/solr-8.1.1/example/example-DIH/solr/tika/conf/managed-schema new file mode 100644 index 000000000..b90f314ff --- /dev/null +++ b/solr-8.1.1/example/example-DIH/solr/tika/conf/managed-schema @@ -0,0 +1,54 @@ +<?xml version="1.0" encoding="UTF-8" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements.  See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License.  You may obtain a copy of the License at + +     http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<schema name="example-DIH-tika" version="1.6"> + +  <uniqueKey>id</uniqueKey> + +  <field name="id" type="string" indexed="true" stored="true"/> +  <field name="author" type="text_simple" indexed="true" stored="true"/> +  <field name="title" type="text_simple" indexed="true" stored="true" multiValued="true"/> +  <field name="format" type="string" indexed="true" stored="true"/> + +  <!-- field "text" is searchable but it is not stored to save space --> +  <field name="text" type="text_simple" indexed="true" stored="false" multiValued="true"/> + + +  <!-- Uncomment the dynamicField definition to catch any other fields +   that may have been declared in the DIH configuration. +   This allows to speed up prototyping. +  --> +  <!-- <dynamicField name="*" type="string" indexed="true" stored="true" multiValued="true"/> --> + +  <!-- The StrField type is not analyzed, but is indexed/stored verbatim. --> +  <fieldType name="string" class="solr.StrField" sortMissingLast="true"/> + + +  <!-- A basic text field that has reasonable, generic +   cross-language defaults: it tokenizes with StandardTokenizer, +   and down cases. It does not deal with stopwords or other issues. +   See other examples for alternative definitions. +  --> +  <fieldType name="text_simple" class="solr.TextField" positionIncrementGap="100"> +    <analyzer> +      <tokenizer class="solr.StandardTokenizerFactory"/> +      <filter class="solr.LowerCaseFilterFactory"/> +    </analyzer> +  </fieldType> + +</schema>
\ No newline at end of file diff --git a/solr-8.1.1/example/example-DIH/solr/tika/conf/solrconfig.xml b/solr-8.1.1/example/example-DIH/solr/tika/conf/solrconfig.xml new file mode 100644 index 000000000..d8509f863 --- /dev/null +++ b/solr-8.1.1/example/example-DIH/solr/tika/conf/solrconfig.xml @@ -0,0 +1,61 @@ +<?xml version="1.0" encoding="UTF-8" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements.  See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License.  You may obtain a copy of the License at + +     http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<!-- + This is a DEMO configuration highlighting elements + specifically needed to get this example running + such as libraries and request handler specifics. + + It uses defaults or does not define most of production-level settings + such as various caches or auto-commit policies. + + See Solr Reference Guide and other examples for + more details on a well configured solrconfig.xml + https://lucene.apache.org/solr/guide/the-well-configured-solr-instance.html +--> + +<config> +  <!-- Controls what version of Lucene various components of Solr +   adhere to.  Generally, you want to use the latest version to +   get all bug fixes and improvements. It is highly recommended +   that you fully re-index after changing this setting as it can +   affect both how text is indexed and queried. +  --> +  <luceneMatchVersion>8.1.1</luceneMatchVersion> + +  <!-- Load Data Import Handler and Apache Tika (extraction) libraries --> +  <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-dataimporthandler-.*\.jar"/> +  <lib dir="${solr.install.dir:../../../..}/contrib/extraction/lib" regex=".*\.jar"/> + +  <requestHandler name="/select" class="solr.SearchHandler"> +    <lst name="defaults"> +      <str name="echoParams">explicit</str> +      <str name="df">text</str> +       <!-- Change from JSON to XML format (the default prior to Solr 7.0) +          <str name="wt">xml</str>  +         --> +    </lst> +  </requestHandler> + +  <requestHandler name="/dataimport" class="solr.DataImportHandler"> +    <lst name="defaults"> +      <str name="config">tika-data-config.xml</str> +    </lst> +  </requestHandler> + +</config> diff --git a/solr-8.1.1/example/example-DIH/solr/tika/conf/tika-data-config.xml b/solr-8.1.1/example/example-DIH/solr/tika/conf/tika-data-config.xml new file mode 100644 index 000000000..5286fc418 --- /dev/null +++ b/solr-8.1.1/example/example-DIH/solr/tika/conf/tika-data-config.xml @@ -0,0 +1,26 @@ +<dataConfig> +  <dataSource type="BinFileDataSource"/> +  <document> +    <entity name="file" processor="FileListEntityProcessor" dataSource="null" +            baseDir="${solr.install.dir}/example/exampledocs" fileName=".*pdf" +            rootEntity="false"> + +      <field column="file" name="id"/> + +      <entity name="pdf" processor="TikaEntityProcessor" +              url="${file.fileAbsolutePath}" format="text"> + +        <field column="Author" name="author" meta="true"/> +        <!-- in the original PDF, the Author meta-field name is upper-cased, +          but in Solr schema it is lower-cased +         --> + +        <field column="title" name="title" meta="true"/> +        <field column="dc:format" name="format" meta="true"/> + +        <field column="text" name="text"/> + +      </entity> +    </entity> +  </document> +</dataConfig> diff --git a/solr-8.1.1/example/example-DIH/solr/tika/core.properties b/solr-8.1.1/example/example-DIH/solr/tika/core.properties new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/solr-8.1.1/example/example-DIH/solr/tika/core.properties @@ -0,0 +1 @@ + | 
