author | Sam Wilkins <samuel_wilkins@brown.edu> | 2019-06-26 14:55:38 -0400
committer | Sam Wilkins <samuel_wilkins@brown.edu> | 2019-06-26 14:55:38 -0400
commit | d564601da06b696f59b97bf162fa52354d49f8c9 (patch)
tree | 1573dbab01fa2127ef0971b4f1c25f4c6fa39330 /src/scraping/buxton/scraper.py
parent | 7b38962bf658e998c33cca0760eeba4a4945332a (diff)
scraping
Diffstat (limited to 'src/scraping/buxton/scraper.py')
-rw-r--r-- | src/scraping/buxton/scraper.py | 331
1 file changed, 331 insertions(+), 0 deletions(-)
diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py
new file mode 100644
index 000000000..97af10519
--- /dev/null
+++ b/src/scraping/buxton/scraper.py
@@ -0,0 +1,331 @@
+import os
+import docx2txt
+from docx import Document
+from docx.opc.constants import RELATIONSHIP_TYPE as RT
+import re
+from pymongo import MongoClient
+import shutil
+import uuid
+import datetime
+from PIL import Image
+import math
+import sys
+
+source = "./source"
+dist = "../../server/public/files"
+
+db = MongoClient("localhost", 27017)["Dash"]
+schema_guids = []
+
+
+def extract_links(fileName):
+    links = []
+    doc = Document(fileName)
+    rels = doc.part.rels
+    for rel in rels:
+        item = rels[rel]
+        if item.reltype == RT.HYPERLINK and ".aspx" not in item._target:
+            links.append(item._target)
+    return listify(links)
+
+
+def extract_value(kv_string):
+    pieces = kv_string.split(":")
+    return (pieces[1] if len(pieces) > 1 else kv_string).strip()
+
+
+def mkdir_if_absent(path):
+    try:
+        if not os.path.exists(path):
+            os.mkdir(path)
+    except OSError:
+        print("failed to create the appropriate directory structures for %s" % path)
+
+
+def guid():
+    return str(uuid.uuid4())
+
+
+def listify(items):
+    return {
+        "fields": items,
+        "__type": "list"
+    }
+
+
+def protofy(fieldId):
+    return {
+        "fieldId": fieldId,
+        "__type": "proxy"
+    }
+
+
+def write_schema(parse_results, display_fields):
+    view_guids = parse_results["child_guids"]
+
+    data_doc = parse_results["schema"]
+    fields = data_doc["fields"]
+
+    view_doc_guid = guid()
+
+    view_doc = {
+        "_id": view_doc_guid,
+        "fields": {
+            "proto": protofy(data_doc["_id"]),
+            "x": 10,
+            "y": 10,
+            "width": 900,
+            "height": 600,
+            "panX": 0,
+            "panY": 0,
+            "zoomBasis": 0.5,
+            "zIndex": 2,
+            "libraryBrush": False,
+            "viewType": 2
+        },
+        "__type": "Doc"
+    }
+
+    fields["proto"] = protofy("collectionProto")
+    fields["data"] = listify(proxify_guids(view_guids))
+    fields["schemaColumns"] = listify(display_fields)
+    fields["backgroundColor"] = "white"
+    fields["scale"] = 0.5
+    fields["viewType"] = 2
+    fields["author"] = "Bill Buxton"
+    fields["creationDate"] = {
+        "date": datetime.datetime.utcnow().microsecond,
+        "__type": "date"
+    }
+    fields["isPrototype"] = True
+    fields["page"] = -1
+
+    db.newDocuments.insert_one(data_doc)
+    db.newDocuments.insert_one(view_doc)
+
+    data_doc_guid = data_doc["_id"]
+    print(f"inserted view document ({view_doc_guid})")
+    print(f"inserted data document ({data_doc_guid})\n")
+
+    return view_doc_guid
+
+
+def write_image(folder, name):
+    path = f"http://localhost:1050/files/{folder}/{name}"
+
+    data_doc_guid = guid()
+    view_doc_guid = guid()
+
+    view_doc = {
+        "_id": view_doc_guid,
+        "fields": {
+            "proto": protofy(data_doc_guid),
+            "x": 10,
+            "y": 10,
+            "width": 300,
+            "zIndex": 2,
+            "libraryBrush": False
+        },
+        "__type": "Doc"
+    }
+
+    image = Image.open(f"{dist}/{folder}/{name}")
+    native_width, native_height = image.size
+
+    data_doc = {
+        "_id": data_doc_guid,
+        "fields": {
+            "proto": protofy("imageProto"),
+            "data": {
+                "url": path,
+                "__type": "image"
+            },
+            "title": name,
+            "nativeWidth": native_width,
+            "author": "Bill Buxton",
+            "creationDate": {
+                "date": datetime.datetime.utcnow().microsecond,
+                "__type": "date"
+            },
+            "isPrototype": True,
+            "page": -1,
+            "nativeHeight": native_height,
+            "height": native_height
+        },
+        "__type": "Doc"
+    }
+
+    db.newDocuments.insert_one(view_doc)
+    db.newDocuments.insert_one(data_doc)
+
+    return view_doc_guid
+
+
+def parse_document(file_name: str):
+    print(f"parsing {file_name}...")
+    pure_name = file_name.split(".")[0]
+
+    result = {}
+
+    dir_path = dist + "/" + pure_name
+    mkdir_if_absent(dir_path)
+
+    raw = str(docx2txt.process(source + "/" + file_name, dir_path))
+
+    view_guids = []
+    count = 0
+    for image in os.listdir(dir_path):
+        count += 1
+        view_guids.append(write_image(pure_name, image))
+        os.rename(dir_path + "/" + image, dir_path +
+                  "/" + image.replace(".", "_m.", 1))
+    print(f"extracted {count} images...")
+
+    def sanitize(line): return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace(
+        u"\u2013", "-").replace(u"\u201c", '''"''').replace(u"\u201d", '''"''').strip()
+
+    def sanitize_price(raw: str):
+        raw = raw.replace(",", "")
+        start = raw.find("$")
+        if start > -1:
+            i = start + 1
+            while (i < len(raw) and re.match(r"[0-9\.]", raw[i])):
+                i += 1
+            price = raw[start + 1: i]
+            return float(price)
+        elif "nfs" in raw.lower():
+            return -1
+        else:
+            return math.nan
+
+    def remove_empty(line): return len(line) > 1
+
+    lines = list(map(sanitize, raw.split("\n")))
+    lines = list(filter(remove_empty, lines))
+
+    result["file_name"] = file_name
+    result["title"] = lines[2].strip()
+    result["short_description"] = lines[3].strip().replace(
+        "Short Description: ", "")
+
+    cur = 5
+    notes = ""
+    while lines[cur] != "Device Details":
+        notes += lines[cur] + " "
+        cur += 1
+    result["buxton_notes"] = notes.strip()
+
+    cur += 1
+    clean = list(
+        map(lambda data: data.strip().split(":"), lines[cur].split("|")))
+    result["company"] = clean[0][len(clean[0]) - 1].strip()
+    result["year"] = clean[1][len(clean[1]) - 1].strip()
+    result["original_price"] = sanitize_price(
+        clean[2][len(clean[2]) - 1].strip())
+
+    cur += 1
+    result["degrees_of_freedom"] = extract_value(
+        lines[cur]).replace("NA", "N/A")
+    cur += 1
+
+    dimensions = lines[cur].lower()
+    if dimensions.startswith("dimensions"):
+        dim_concat = dimensions[11:].strip()
+        cur += 1
+        while lines[cur] != "Key Words":
+            dim_concat += (" " + lines[cur].strip())
+            cur += 1
+        result["dimensions"] = dim_concat
+    else:
+        result["dimensions"] = "N/A"
+
+    cur += 1
+    result["primary_key"] = extract_value(lines[cur])
+    cur += 1
+    result["secondary_key"] = extract_value(lines[cur])
+
+    while lines[cur] != "Links":
+        result["secondary_key"] += (" " + extract_value(lines[cur]).strip())
+        cur += 1
+
+    cur += 1
+    link_descriptions = []
+    while lines[cur] != "Image":
+        link_descriptions.append(lines[cur].strip())
+        cur += 1
+    result["link_descriptions"] = listify(link_descriptions)
+
+    result["hyperlinks"] = extract_links(source + "/" + file_name)
+
+    images = []
+    captions = []
+    cur += 3
+    while cur + 1 < len(lines) and lines[cur] != "NOTES:":
+        images.append(lines[cur])
+        captions.append(lines[cur + 1])
+        cur += 2
+    result["images"] = listify(images)
+    result["captions"] = listify(captions)
+
+    notes = []
+    if (cur < len(lines) and lines[cur] == "NOTES:"):
+        cur += 1
+        while cur < len(lines):
+            notes.append(lines[cur])
+            cur += 1
+    if len(notes) > 0:
+        result["notes"] = listify(notes)
+
+    print("writing child schema...")
+
+    return {
+        "schema": {
+            "_id": guid(),
+            "fields": result,
+            "__type": "Doc"
+        },
+        "child_guids": view_guids
+    }
+
+
+def proxify_guids(guids):
+    return list(map(lambda guid: {"fieldId": guid, "__type": "proxy"}, guids))
+
+
+if os.path.exists(dist):
+    shutil.rmtree(dist)
+while os.path.exists(dist):
+    pass
+os.mkdir(dist)
+mkdir_if_absent(source)
+
+candidates = 0
+for file_name in os.listdir(source):
+    if file_name.endswith('.docx'):
+        candidates += 1
+        schema_guids.append(write_schema(
+            parse_document(file_name), ["title", "data"]))
+
+print("writing parent schema...")
+parent_guid = write_schema({
+    "schema": {
+        "_id": guid(),
+        "fields": {},
+        "__type": "Doc"
+    },
+    "child_guids": schema_guids
+}, ["title", "short_description", "original_price"])
+
+print("appending parent schema to main workspace...\n")
+db.newDocuments.update_one(
+    {"fields.title": "WS collection 1"},
+    {"$push": {"fields.data.fields": {"fieldId": parent_guid, "__type": "proxy"}}}
+)
+
+print("rewriting .gitignore...\n")
+lines = ['*', '!.gitignore']
+with open(dist + "/.gitignore", 'w') as f:
+    f.write('\n'.join(lines))
+
+suffix = "" if candidates == 1 else "s"
+print(f"conversion complete. {candidates} candidate{suffix} processed.")
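For a quick post-run sanity check, a minimal sketch (not part of this commit) that queries the same local MongoDB instance and "Dash" database the scraper writes to:

from pymongo import MongoClient

# Connect to the database the scraper populates (assumes localhost:27017, db "Dash").
db = MongoClient("localhost", 27017)["Dash"]

# Count the inserted documents and inspect one generated data document.
print(db.newDocuments.count_documents({}))
print(db.newDocuments.find_one({"fields.author": "Bill Buxton"}))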