diff options
author | bob <bcz@cs.brown.edu> | 2019-08-08 11:21:57 -0400 |
---|---|---|
committer | bob <bcz@cs.brown.edu> | 2019-08-08 11:21:57 -0400 |
commit | e946f85b46e1b1a75ba46ccf9a1ee023e749b837 (patch) | |
tree | 13e6e489d10b7707a9e10fde87f399e396d506cd /src/scraping/buxton/scraper.py | |
parent | 3110d85b7e1efab006a13824792b031f63dba8c8 (diff) | |
parent | 9776f0dbb1189105e5b947beb107203f4404c40c (diff) |
Merge branch 'master' of https://github.com/browngraphicslab/Dash-Web
Diffstat (limited to 'src/scraping/buxton/scraper.py')
-rw-r--r-- | src/scraping/buxton/scraper.py | 18 |
1 files changed, 10 insertions, 8 deletions
diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py index 29cb8a256..807216ef1 100644 --- a/src/scraping/buxton/scraper.py +++ b/src/scraping/buxton/scraper.py @@ -17,6 +17,7 @@ dist = "../../server/public/files" db = MongoClient("localhost", 27017)["Dash"] target_collection = db.newDocuments +target_doc_title = "Workspace 1" schema_guids = [] common_proto_id = "" @@ -69,7 +70,7 @@ def text_doc_map(string_list): return listify(proxify_guids(list(map(guid_map, string_list)))) -def write_schema(parse_results, display_fields, storage_key): +def write_collection(parse_results, display_fields, storage_key, viewType=2): view_guids = parse_results["child_guids"] data_doc = parse_results["schema"] @@ -88,8 +89,9 @@ def write_schema(parse_results, display_fields, storage_key): "panX": 0, "panY": 0, "zoomBasis": 1, - "zIndex": 2 - "viewType": 2 + "zIndex": 2, + "libraryBrush": False, + "viewType": viewType }, "__type": "Doc" } @@ -234,7 +236,7 @@ def parse_document(file_name: str): copyfile(dir_path + "/" + image, dir_path + "/" + image.replace(".", "_o.", 1)) copyfile(dir_path + "/" + image, dir_path + - "/" + image.replace(".", "_m.", 1)) + "/" + image.replace(".", "_m.", 1)) print(f"extracted {count} images...") def sanitize(line): return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace( @@ -378,22 +380,22 @@ candidates = 0 for file_name in os.listdir(source): if file_name.endswith('.docx'): candidates += 1 - schema_guids.append(write_schema( + schema_guids.append(write_collection( parse_document(file_name), ["title", "data"], "image_data")) print("writing parent schema...") -parent_guid = write_schema({ +parent_guid = write_collection({ "schema": { "_id": guid(), "fields": {}, "__type": "Doc" }, "child_guids": schema_guids -}, ["title", "short_description", "original_price"], "data") +}, ["title", "short_description", "original_price"], "data", 1) print("appending parent schema to main workspace...\n") target_collection.update_one( - {"fields.title": "WS collection 1"}, + {"fields.title": target_doc_title}, {"$push": {"fields.data.fields": {"fieldId": parent_guid, "__type": "proxy"}}} ) |