aboutsummaryrefslogtreecommitdiff
path: root/src/scraping/buxton/scraper.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/scraping/buxton/scraper.py')
-rw-r--r--src/scraping/buxton/scraper.py40
1 files changed, 30 insertions, 10 deletions
diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py
index 700269727..48b8fe3fa 100644
--- a/src/scraping/buxton/scraper.py
+++ b/src/scraping/buxton/scraper.py
@@ -15,7 +15,9 @@ source = "./source"
dist = "../../server/public/files"
db = MongoClient("localhost", 27017)["Dash"]
+target_collection = db.newDocuments
schema_guids = []
+common_proto_id = ""
def extract_links(fileName):
@@ -84,7 +86,7 @@ def write_schema(parse_results, display_fields, storage_key):
"height": 600,
"panX": 0,
"panY": 0,
- "zoomBasis": 0.5,
+ "zoomBasis": 1,
"zIndex": 2,
"libraryBrush": False,
"viewType": 2
@@ -92,7 +94,7 @@ def write_schema(parse_results, display_fields, storage_key):
"__type": "Doc"
}
- fields["proto"] = protofy("collectionProto")
+ fields["proto"] = protofy(common_proto_id)
fields[storage_key] = listify(proxify_guids(view_guids))
fields["schemaColumns"] = listify(display_fields)
fields["backgroundColor"] = "white"
@@ -106,8 +108,8 @@ def write_schema(parse_results, display_fields, storage_key):
fields["isPrototype"] = True
fields["page"] = -1
- db.newDocuments.insert_one(data_doc)
- db.newDocuments.insert_one(view_doc)
+ target_collection.insert_one(data_doc)
+ target_collection.insert_one(view_doc)
data_doc_guid = data_doc["_id"]
print(f"inserted view document ({view_doc_guid})")
@@ -136,7 +138,7 @@ def write_text_doc(content):
data_doc = {
"_id": data_doc_guid,
"fields": {
- "proto": protofy("textProto"),
+ "proto": protofy("commonImportProto"),
"data": {
"Data": '{"doc":{"type":"doc","content":[{"type":"paragraph","content":[{"type":"text","text":"' + content + '"}]}]},"selection":{"type":"text","anchor":1,"head":1}' + '}',
"__type": "RichTextField"
@@ -158,8 +160,8 @@ def write_text_doc(content):
"__type": "Doc"
}
- db.newDocuments.insert_one(view_doc)
- db.newDocuments.insert_one(data_doc)
+ target_collection.insert_one(view_doc)
+ target_collection.insert_one(data_doc)
return view_doc_guid
@@ -209,8 +211,8 @@ def write_image(folder, name):
"__type": "Doc"
}
- db.newDocuments.insert_one(view_doc)
- db.newDocuments.insert_one(data_doc)
+ target_collection.insert_one(view_doc)
+ target_collection.insert_one(data_doc)
return view_doc_guid
@@ -347,6 +349,22 @@ def proxify_guids(guids):
return list(map(lambda guid: {"fieldId": guid, "__type": "proxy"}, guids))
+def write_common_proto():
+ id = guid()
+ common_proto = {
+ "_id": id,
+ "fields": {
+ "proto": protofy("collectionProto"),
+ "title": "Common Import Proto",
+ },
+ "__type": "Doc"
+ }
+
+ target_collection.insert_one(common_proto)
+
+ return id
+
+
if os.path.exists(dist):
shutil.rmtree(dist)
while os.path.exists(dist):
@@ -354,6 +372,8 @@ while os.path.exists(dist):
os.mkdir(dist)
mkdir_if_absent(source)
+common_proto_id = write_common_proto()
+
candidates = 0
for file_name in os.listdir(source):
if file_name.endswith('.docx'):
@@ -372,7 +392,7 @@ parent_guid = write_schema({
}, ["title", "short_description", "original_price"], "data")
print("appending parent schema to main workspace...\n")
-db.newDocuments.update_one(
+target_collection.update_one(
{"fields.title": "WS collection 1"},
{"$push": {"fields.data.fields": {"fieldId": parent_guid, "__type": "proxy"}}}
)