diff options
| author | kimdahey <claire_kim1@brown.edu> | 2020-01-16 11:31:41 -0500 |
|---|---|---|
| committer | kimdahey <claire_kim1@brown.edu> | 2020-01-16 11:31:41 -0500 |
| commit | 6be0e19ed0bd13f3796f542affa5a2e52674650c (patch) | |
| tree | 1be222ea9341ecd8020fad3149035fa650a8a07f /solr-8.1.1/example/films/film_data_generator.py | |
| parent | 5cde81d8c6b4dcd8d0796f8669b668763957f395 (diff) | |
| parent | e410cde0e430553002d4e1a2f64364b57b65fdbc (diff) | |
merged w master
Diffstat (limited to 'solr-8.1.1/example/films/film_data_generator.py')
| -rw-r--r-- | solr-8.1.1/example/films/film_data_generator.py | 117 |
1 files changed, 0 insertions, 117 deletions
diff --git a/solr-8.1.1/example/films/film_data_generator.py b/solr-8.1.1/example/films/film_data_generator.py deleted file mode 100644 index 7e2a46318..000000000 --- a/solr-8.1.1/example/films/film_data_generator.py +++ /dev/null @@ -1,117 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This will generate a movie data set of 1100 records. -These are the first 1100 movies which appear when querying the Freebase of type '/film/film'. -Here is the link to the freebase page - https://www.freebase.com/film/film?schema= - -Usage - python3 film_data_generator.py -""" - -import csv -import copy -import json -import codecs -import datetime -import urllib.parse -import urllib.request -import xml.etree.cElementTree as ET -from xml.dom import minidom - -MAX_ITERATIONS=10 #10 limits it to 1100 docs - -# You need an API Key by Google to run this -API_KEY = '<insert your Google developer API key>' -service_url = 'https://www.googleapis.com/freebase/v1/mqlread' -query = [{ - "id": None, - "name": None, - "initial_release_date": None, - "directed_by": [], - "genre": [], - "type": "/film/film", - "initial_release_date>" : "2000" -}] - -def gen_csv(filmlist): - filmlistDup = copy.deepcopy(filmlist) - #Convert multi-valued to % delimited string - for film in filmlistDup: - for key in film: - if isinstance(film[key], list): - film[key] = '|'.join(film[key]) - keys = ['name', 'directed_by', 'genre', 'type', 'id', 'initial_release_date'] - with open('films.csv', 'w', newline='', encoding='utf8') as csvfile: - dict_writer = csv.DictWriter(csvfile, keys) - dict_writer.writeheader() - dict_writer.writerows(filmlistDup) - -def gen_json(filmlist): - filmlistDup = copy.deepcopy(filmlist) - with open('films.json', 'w') as jsonfile: - jsonfile.write(json.dumps(filmlist, indent=2)) - -def gen_xml(filmlist): - root = ET.Element("add") - for film in filmlist: - doc = ET.SubElement(root, "doc") - for key in film: - if isinstance(film[key], list): - for value in film[key]: - field = ET.SubElement(doc, "field") - field.set("name", key) - field.text=value - else: - field = ET.SubElement(doc, "field") - field.set("name", key) - field.text=film[key] - tree = ET.ElementTree(root) - with open('films.xml', 'w') as f: - f.write( minidom.parseString(ET.tostring(tree.getroot(),'utf-8')).toprettyxml(indent=" ") ) - -def do_query(filmlist, cursor=""): - params = { - 'query': json.dumps(query), - 'key': API_KEY, - 'cursor': cursor - } - url = service_url + '?' + urllib.parse.urlencode(params) - data = urllib.request.urlopen(url).read().decode('utf-8') - response = json.loads(data) - for item in response['result']: - del item['type'] # It's always /film/film. No point of adding this. - try: - datetime.datetime.strptime(item['initial_release_date'], "%Y-%m-%d") - except ValueError: - #Date time not formatted properly. Keeping it simple by removing the date field from that doc - del item['initial_release_date'] - filmlist.append(item) - return response.get("cursor") - - -if __name__ == "__main__": - filmlist = [] - cursor = do_query(filmlist) - i=0 - while(cursor): - cursor = do_query(filmlist, cursor) - i = i+1 - if i==MAX_ITERATIONS: - break - - gen_json(filmlist) - gen_csv(filmlist) - gen_xml(filmlist) |
