diff options
| author | kimdahey <claire_kim1@brown.edu> | 2020-01-16 11:31:41 -0500 |
|---|---|---|
| committer | kimdahey <claire_kim1@brown.edu> | 2020-01-16 11:31:41 -0500 |
| commit | 6be0e19ed0bd13f3796f542affa5a2e52674650c (patch) | |
| tree | 1be222ea9341ecd8020fad3149035fa650a8a07f /solr-8.3.1/example/films/film_data_generator.py | |
| parent | 5cde81d8c6b4dcd8d0796f8669b668763957f395 (diff) | |
| parent | e410cde0e430553002d4e1a2f64364b57b65fdbc (diff) | |
merged w master
Diffstat (limited to 'solr-8.3.1/example/films/film_data_generator.py')
| -rw-r--r-- | solr-8.3.1/example/films/film_data_generator.py | 117 |
1 files changed, 117 insertions, 0 deletions
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This will generate a movie data set of 1100 records.
These are the first 1100 movies which appear when querying the Freebase of type '/film/film'.
Here is the link to the freebase page - https://www.freebase.com/film/film?schema=

Usage - python3 film_data_generator.py
"""

import csv
import copy
import json
import codecs
import datetime
import urllib.parse
import urllib.request
# xml.etree.cElementTree was removed in Python 3.9; ElementTree uses the
# C accelerator automatically when it is available.
import xml.etree.ElementTree as ET
from xml.dom import minidom

# Number of follow-up (cursor) queries issued after the initial one.
MAX_ITERATIONS=10 #10 limits it to 1100 docs

# You need an API Key by Google to run this
API_KEY = '<insert your Google developer API key>'
# NOTE(review): confirm this endpoint is still served before relying on it.
service_url = 'https://www.googleapis.com/freebase/v1/mqlread'
query = [{
    "id": None,
    "name": None,
    "initial_release_date": None,
    "directed_by": [],
    "genre": [],
    "type": "/film/film",
    "initial_release_date>" : "2000"
}]


def gen_csv(filmlist):
    """Write ``filmlist`` to ``films.csv`` in the current directory.

    List-valued fields are flattened into a single '|' delimited string.
    ``filmlist`` itself is not modified. Columns absent from a film are
    written as empty cells; a film carrying a key outside the fixed column
    set makes ``csv.DictWriter`` raise ``ValueError``.
    """
    keys = ['name', 'directed_by', 'genre', 'type', 'id', 'initial_release_date']
    # Convert multi-valued fields to '|' delimited strings on fresh dicts so
    # the caller's data stays untouched for the other generators.
    rows = [
        {
            key: '|'.join(value) if isinstance(value, list) else value
            for key, value in film.items()
        }
        for film in filmlist
    ]
    with open('films.csv', 'w', newline='', encoding='utf8') as csvfile:
        dict_writer = csv.DictWriter(csvfile, keys)
        dict_writer.writeheader()
        dict_writer.writerows(rows)


def gen_json(filmlist):
    """Write ``filmlist`` to ``films.json`` as JSON indented by two spaces."""
    # json.dumps escapes non-ASCII by default; the explicit encoding just
    # keeps the output independent of the platform default.
    with open('films.json', 'w', encoding='utf-8') as jsonfile:
        jsonfile.write(json.dumps(filmlist, indent=2))


def gen_xml(filmlist):
    """Write ``filmlist`` to ``films.xml`` as an ``<add>`` of ``<doc>`` elements.

    Each key of a film becomes a ``<field name="key">`` element; a
    list-valued key produces one ``<field>`` per list entry.
    """
    root = ET.Element("add")
    for film in filmlist:
        doc = ET.SubElement(root, "doc")
        for key, value in film.items():
            # Normalise scalars to a one-element list so both shapes share
            # the same element-building code.
            values = value if isinstance(value, list) else [value]
            for entry in values:
                field = ET.SubElement(doc, "field")
                field.set("name", key)
                field.text = entry
    pretty = minidom.parseString(ET.tostring(root, 'utf-8')).toprettyxml(indent=" ")
    # The XML declaration emitted by toprettyxml names no encoding, which XML
    # parsers read as UTF-8, so the file must really be written as UTF-8.
    with open('films.xml', 'w', encoding='utf-8') as f:
        f.write(pretty)


def _clean_item(item):
    """Strip fields that should not be indexed from one query result, in place.

    Removes ``type`` and drops ``initial_release_date`` unless it is a full
    ``YYYY-MM-DD`` date. Returns the same dict for convenience.
    """
    item.pop('type', None)  # It's always /film/film. No point of adding this.
    try:
        datetime.datetime.strptime(item['initial_release_date'], "%Y-%m-%d")
    except (KeyError, TypeError, ValueError):
        # Missing, null or partial date. Keeping it simple by removing the
        # date field from that doc.
        item.pop('initial_release_date', None)
    return item


def do_query(filmlist, cursor=""):
    """Fetch one page of results and append the cleaned items to ``filmlist``.

    ``cursor`` is the paging token returned by the previous call (empty for
    the first page). Returns the ``cursor`` value of the response, or
    ``None`` when the response carries no ``cursor`` key.
    """
    params = {
        'query': json.dumps(query),
        'key': API_KEY,
        'cursor': cursor
    }
    url = service_url + '?' + urllib.parse.urlencode(params)
    # Use the response as a context manager so the connection is always closed.
    with urllib.request.urlopen(url) as reply:
        data = reply.read().decode('utf-8')
    response = json.loads(data)
    for item in response['result']:
        filmlist.append(_clean_item(item))
    return response.get("cursor")


if __name__ == "__main__":
    filmlist = []
    cursor = do_query(filmlist)
    i = 0
    # One initial query plus at most MAX_ITERATIONS follow-ups.
    while cursor and i < MAX_ITERATIONS:
        cursor = do_query(filmlist, cursor)
        i += 1

    gen_json(filmlist)
    gen_csv(filmlist)
    gen_xml(filmlist)
