diff options
Diffstat (limited to 'src/scraping/buxton/scraper.py')
| -rw-r--r-- | src/scraping/buxton/scraper.py | 40 | 
1 files changed, 30 insertions, 10 deletions
diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py index 700269727..48b8fe3fa 100644 --- a/src/scraping/buxton/scraper.py +++ b/src/scraping/buxton/scraper.py @@ -15,7 +15,9 @@ source = "./source"  dist = "../../server/public/files"  db = MongoClient("localhost", 27017)["Dash"] +target_collection = db.newDocuments  schema_guids = [] +common_proto_id = ""  def extract_links(fileName): @@ -84,7 +86,7 @@ def write_schema(parse_results, display_fields, storage_key):              "height": 600,              "panX": 0,              "panY": 0, -            "zoomBasis": 0.5, +            "zoomBasis": 1,              "zIndex": 2,              "libraryBrush": False,              "viewType": 2 @@ -92,7 +94,7 @@ def write_schema(parse_results, display_fields, storage_key):          "__type": "Doc"      } -    fields["proto"] = protofy("collectionProto") +    fields["proto"] = protofy(common_proto_id)      fields[storage_key] = listify(proxify_guids(view_guids))      fields["schemaColumns"] = listify(display_fields)      fields["backgroundColor"] = "white" @@ -106,8 +108,8 @@ def write_schema(parse_results, display_fields, storage_key):      fields["isPrototype"] = True      fields["page"] = -1 -    db.newDocuments.insert_one(data_doc) -    db.newDocuments.insert_one(view_doc) +    target_collection.insert_one(data_doc) +    target_collection.insert_one(view_doc)      data_doc_guid = data_doc["_id"]      print(f"inserted view document ({view_doc_guid})") @@ -136,7 +138,7 @@ def write_text_doc(content):      data_doc = {          "_id": data_doc_guid,          "fields": { -            "proto": protofy("textProto"), +            "proto": protofy("commonImportProto"),              "data": {                  "Data": '{"doc":{"type":"doc","content":[{"type":"paragraph","content":[{"type":"text","text":"' + content + '"}]}]},"selection":{"type":"text","anchor":1,"head":1}' + '}',                  "__type": "RichTextField" @@ -158,8 +160,8 @@ def write_text_doc(content):          "__type": "Doc"      } -    db.newDocuments.insert_one(view_doc) -    db.newDocuments.insert_one(data_doc) +    target_collection.insert_one(view_doc) +    target_collection.insert_one(data_doc)      return view_doc_guid @@ -209,8 +211,8 @@ def write_image(folder, name):          "__type": "Doc"      } -    db.newDocuments.insert_one(view_doc) -    db.newDocuments.insert_one(data_doc) +    target_collection.insert_one(view_doc) +    target_collection.insert_one(data_doc)      return view_doc_guid @@ -347,6 +349,22 @@ def proxify_guids(guids):      return list(map(lambda guid: {"fieldId": guid, "__type": "proxy"}, guids)) +def write_common_proto(): +    id = guid() +    common_proto = { +        "_id": id, +        "fields": { +            "proto": protofy("collectionProto"), +            "title": "Common Import Proto", +        }, +        "__type": "Doc" +    } + +    target_collection.insert_one(common_proto) + +    return id + +  if os.path.exists(dist):      shutil.rmtree(dist)  while os.path.exists(dist): @@ -354,6 +372,8 @@ while os.path.exists(dist):  os.mkdir(dist)  mkdir_if_absent(source) +common_proto_id = write_common_proto() +  candidates = 0  for file_name in os.listdir(source):      if file_name.endswith('.docx'): @@ -372,7 +392,7 @@ parent_guid = write_schema({  }, ["title", "short_description", "original_price"], "data")  print("appending parent schema to main workspace...\n") -db.newDocuments.update_one( +target_collection.update_one(      {"fields.title": "WS collection 1"},      {"$push": {"fields.data.fields": {"fieldId": parent_guid, "__type": "proxy"}}}  )  | 
