From c390449ce566249b0f947c60da89bc694672e647 Mon Sep 17 00:00:00 2001 From: Sam Wilkins Date: Sun, 2 Feb 2020 09:58:07 -0500 Subject: restructuring buxton source --- src/scraping/buxton/scraper.py | 414 +++++++++++------------------------------ 1 file changed, 111 insertions(+), 303 deletions(-) (limited to 'src/scraping/buxton/scraper.py') diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py index ec9c3f72c..394958823 100644 --- a/src/scraping/buxton/scraper.py +++ b/src/scraping/buxton/scraper.py @@ -1,37 +1,32 @@ import os -from shutil import copyfile import docx2txt from docx import Document from docx.opc.constants import RELATIONSHIP_TYPE as RT import re -from pymongo import MongoClient import shutil import uuid -import datetime +import json +import base64 +from shutil import copyfile from PIL import Image -import math -import sys - -source = "./source" -filesPath = "../../server/public/files" -image_dist = filesPath + "/images/buxton" -db = MongoClient("localhost", 27017)["Dash"] -target_collection = db.newDocuments -target_doc_title = "Collection 1" -schema_guids = [] -common_proto_id = "" +files_path = "../../server/public/files" +source_path = "./source" +temp_images_path = "./extracted_images" +server_images_path = f"{files_path}/images/buxton" +json_path = "./json" -def extract_links(fileName): +# noinspection PyProtectedMember +def extract_links(file): links = [] - doc = Document(fileName) + doc = Document(file) rels = doc.part.rels for rel in rels: item = rels[rel] if item.reltype == RT.HYPERLINK and ".aspx" not in item._target: links.append(item._target) - return text_doc_map(links) + return links def extract_value(kv_string): @@ -51,233 +46,72 @@ def guid(): return str(uuid.uuid4()) -def listify(list): - return { - "fields": list, - "__type": "list" - } - - -def protofy(fieldId): - return { - "fieldId": fieldId, - "__type": "proxy" - } - - -def text_doc_map(string_list): - def guid_map(caption): - return write_text_doc(caption) - return listify(proxify_guids(list(map(guid_map, string_list)))) - - -def write_collection(parse_results, display_fields, storage_key, viewType): - view_guids = parse_results["child_guids"] - - data_doc = parse_results["schema"] - fields = data_doc["fields"] - - view_doc_guid = guid() - - view_doc = { - "_id": view_doc_guid, - "fields": { - "proto": protofy(data_doc["_id"]), - "x": 10, - "y": 10, - "_width": 900, - "_height": 600, - "_panX": 0, - "_panY": 0, - "zIndex": 2, - "libraryBrush": False, - "_viewType": viewType, - "_LODdisable": True - }, - "__type": "Doc" - } - - fields["proto"] = protofy(common_proto_id) - fields[storage_key] = listify(proxify_guids(view_guids)) - fields["schemaColumns"] = listify(display_fields) - fields["author"] = "Bill Buxton" - fields["creationDate"] = { - "date": datetime.datetime.utcnow().microsecond, - "__type": "date" - } - if "image_urls" in parse_results: - fields["hero"] = { - "url": parse_results["image_urls"][0], - "__type": "image" - } - fields["isPrototype"] = True - - target_collection.insert_one(data_doc) - target_collection.insert_one(view_doc) - - data_doc_guid = data_doc["_id"] - print(f"inserted view document ({view_doc_guid})") - print(f"inserted data document ({data_doc_guid})\n") - - return view_doc_guid - - -def write_text_doc(content): - data_doc_guid = guid() - view_doc_guid = guid() - - view_doc = { - "_id": view_doc_guid, - "fields": { - "proto": protofy(data_doc_guid), - "x": 10, - "y": 10, - "_width": 400, - "zIndex": 2 - }, - "__type": "Doc" - } - - data_doc = { - 
"_id": data_doc_guid, - "fields": { - "proto": protofy("textProto"), - "data": { - "Data": '{"doc":{"type":"doc","content":[{"type":"paragraph","content":[{"type":"text","text":"' + content + '"}]}]},"selection":{"type":"text","anchor":1,"head":1}' + '}', - "__type": "RichTextField" - }, - "title": content, - "_nativeWidth": 200, - "author": "Bill Buxton", - "creationDate": { - "date": datetime.datetime.utcnow().microsecond, - "__type": "date" - }, - "isPrototype": True, - "_autoHeight": True, - "page": -1, - "_nativeHeight": 200, - "_height": 200, - "data_text": content - }, - "__type": "Doc" - } - - target_collection.insert_one(view_doc) - target_collection.insert_one(data_doc) - - return view_doc_guid - - -def write_image(folder, name): - path = f"http://localhost:1050/files/images/buxton/{folder}/{name}" - - data_doc_guid = guid() - view_doc_guid = guid() - - image = Image.open(f"{image_dist}/{folder}/{name}") - native_width, native_height = image.size - - if abs(native_width - native_height) < 10: - return None - - view_doc = { - "_id": view_doc_guid, - "fields": { - "proto": protofy(data_doc_guid), - "x": 10, - "y": 10, - "_width": min(800, native_width), - "zIndex": 2, - "widthUnit": "*", - "widthMagnitude": 1 - }, - "__type": "Doc" - } - - data_doc = { - "_id": data_doc_guid, - "fields": { - "proto": protofy("imageProto"), - "data": { - "url": path, - "__type": "image" - }, - "title": name, - "_nativeWidth": native_width, - "author": "Bill Buxton", - "creationDate": { - "date": datetime.datetime.utcnow().microsecond, - "__type": "date" - }, - "isPrototype": True, - "page": -1, - "_nativeHeight": native_height, - "_height": native_height - }, - "__type": "Doc" - } - - target_collection.insert_one(view_doc) - target_collection.insert_one(data_doc) - - return { - "layout_id": view_doc_guid, - "url": path - } - - -def parse_document(file_name: str): - print(f"parsing {file_name}...") - pure_name = file_name.split(".")[0] +def encode_image(folder: str, name: str): + with open(f"{temp_images_path}/{folder}/{name}", "rb") as image: + encoded = base64.b64encode(image.read()) + return encoded.decode("utf-8") + + +def parse_document(name: str): + print(f"parsing {name}...") + pure_name = name.split(".")[0] result = {} - dir_path = image_dist + "/" + pure_name - print(dir_path) - mkdir_if_absent(dir_path) - - raw = str(docx2txt.process(source + "/" + file_name, dir_path)) - - urls = [] - view_guids = [] - count = 0 - for image in os.listdir(dir_path): - created = write_image(pure_name, image) - if created != None: - urls.append(created["url"]) - view_guids.append(created["layout_id"]) - count += 1 - resolved = dir_path + "/" + image - original = dir_path + "/" + image.replace(".", "_o.", 1) - medium = dir_path + "/" + image.replace(".", "_m.", 1) - copyfile(resolved, original) - copyfile(resolved, medium) - print(f"extracted {count} images...") + saved_device_images_dir = server_images_path + "/" + pure_name + temp_device_images_dir = temp_images_path + "/" + pure_name + mkdir_if_absent(temp_device_images_dir) + mkdir_if_absent(saved_device_images_dir) + + raw = str(docx2txt.process(source_path + + "/" + name, temp_device_images_dir)) + + extracted_images = [] + for image in os.listdir(temp_device_images_dir): + temp = f"{temp_device_images_dir}/{image}" + native_width, native_height = Image.open(temp).size + if abs(native_width - native_height) < 10: + continue + original = saved_device_images_dir + "/" + image.replace(".", "_o.", 1) + medium = saved_device_images_dir + "/" + 
image.replace(".", "_m.", 1) + copyfile(temp, original) + copyfile(temp, medium) + server_path = f"http://localhost:1050/files/images/buxton/{pure_name}/{image}" + extracted_images.append(server_path) + result["extracted_images"] = extracted_images def sanitize(line): return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace( u"\u2013", "-").replace(u"\u201c", '''"''').replace(u"\u201d", '''"''').strip() - def sanitize_price(raw: str): - raw = raw.replace(",", "") - start = raw.find("$") + def sanitize_price(raw_price: str): + raw_price = raw_price.replace(",", "") + start = raw_price.find("$") + if "x" in raw_price.lower(): + return None if start > -1: i = start + 1 - while (i < len(raw) and re.match(r"[0-9\.]", raw[i])): + while i < len(raw_price) and re.match(r"[0-9.]", raw_price[i]): i += 1 - price = raw[start + 1: i + 1] + price = raw_price[start + 1: i + 1] return float(price) - elif (raw.lower().find("nfs")): + elif raw_price.lower().find("nfs"): return -1 else: - return math.nan + return None def remove_empty(line): return len(line) > 1 + def try_parse(to_parse: int): + value: int + try: + value = int(to_parse) + except ValueError: + value = None + return value + lines = list(map(sanitize, raw.split("\n"))) lines = list(filter(remove_empty, lines)) - result["file_name"] = file_name result["title"] = lines[2].strip() result["short_description"] = lines[3].strip().replace( "Short Description: ", "") @@ -293,13 +127,15 @@ def parse_document(file_name: str): clean = list( map(lambda data: data.strip().split(":"), lines[cur].split("|"))) result["company"] = clean[0][len(clean[0]) - 1].strip() - result["year"] = clean[1][len(clean[1]) - 1].strip() + + result["year"] = try_parse(clean[1][len(clean[1]) - 1].strip()) result["original_price"] = sanitize_price( clean[2][len(clean[2]) - 1].strip()) cur += 1 - result["degrees_of_freedom"] = extract_value( - lines[cur]).replace("NA", "N/A") + + result["degrees_of_freedom"] = try_parse(extract_value( + lines[cur]).replace("NA", "N/A")) cur += 1 dimensions = lines[cur].lower() @@ -325,99 +161,71 @@ def parse_document(file_name: str): cur += 1 link_descriptions = [] while lines[cur] != "Image": - link_descriptions.append(lines[cur].strip()) + description = lines[cur].strip().lower() + valid = True + for ignored in ["powerpoint", "vimeo", "xxx"]: + if ignored in description: + valid = False + break + if valid: + link_descriptions.append(description) cur += 1 - result["link_descriptions"] = text_doc_map(link_descriptions) + result["link_descriptions"] = link_descriptions - result["hyperlinks"] = extract_links(source + "/" + file_name) + result["hyperlinks"] = extract_links(source_path + "/" + name) images = [] captions = [] cur += 3 while cur + 1 < len(lines) and lines[cur] != "NOTES:": - images.append(lines[cur]) - captions.append(lines[cur + 1]) + name = lines[cur] + if "full document" not in name.lower(): + images.append(name) + captions.append(lines[cur + 1]) cur += 2 - result["images"] = listify(images) + result["table_image_names"] = images - result["captions"] = text_doc_map(captions) + result["captions"] = captions notes = [] - if (cur < len(lines) and lines[cur] == "NOTES:"): + if cur < len(lines) and lines[cur] == "NOTES:": cur += 1 while cur < len(lines): notes.append(lines[cur]) cur += 1 if len(notes) > 0: - result["notes"] = listify(notes) - - print("writing child schema...") - - return { - "schema": { - "_id": guid(), - "fields": result, - "__type": "Doc" - }, - "child_guids": view_guids, - "image_urls": urls - } - - -def 
proxify_guids(guids): - return list(map(lambda guid: {"fieldId": guid, "__type": "prefetch_proxy"}, guids)) - - -def write_common_proto(): - id = guid() - common_proto = { - "_id": id, - "fields": { - "proto": protofy("collectionProto"), - "title": "The Buxton Collection", - }, - "__type": "Doc" - } - target_collection.insert_one(common_proto) - return id - - -if os.path.exists(image_dist): - shutil.rmtree(image_dist) -while os.path.exists(image_dist): + result["notes"] = notes + + return result + + +if os.path.exists(server_images_path): + shutil.rmtree(server_images_path) +while os.path.exists(server_images_path): pass -os.mkdir(image_dist) -mkdir_if_absent(source) +os.mkdir(server_images_path) -common_proto_id = write_common_proto() +mkdir_if_absent(source_path) +mkdir_if_absent(json_path) +mkdir_if_absent(temp_images_path) + +results = [] candidates = 0 -for file_name in os.listdir(source): - if file_name.endswith('.docx'): +for file_name in os.listdir(source_path): + if file_name.endswith('.docx') or file_name.endswith(".doc"): candidates += 1 - schema_guids.append(write_collection( - parse_document(file_name), ["title", "data"], "data", 5)) - -print("writing parent schema...") -parent_guid = write_collection({ - "schema": { - "_id": guid(), - "fields": {}, - "__type": "Doc" - }, - "child_guids": schema_guids -}, ["title", "short_description", "original_price"], "data", 2) - -print("appending parent schema to main workspace...\n") -target_collection.update_one( - {"fields.title": target_doc_title}, - {"$push": {"fields.data.fields": {"fieldId": parent_guid, "__type": "proxy"}}} -) - -print("rewriting .gitignore...\n") -lines = ['*', '!.gitignore'] -with open(filesPath + "/.gitignore", 'w') as f: - f.write('\n'.join(lines)) - -suffix = "" if candidates == 1 else "s" -print(f"conversion complete. 
{candidates} candidate{suffix} processed.") -- cgit v1.2.3-70-g09d2 From 90d6454c05cdeb109da25dd55d428c140defca49 Mon Sep 17 00:00:00 2001 From: Sam Wilkins Date: Sun, 2 Feb 2020 12:46:57 -0500 Subject: fixed scraper --- src/scraping/buxton/.idea/workspace.xml | 46 ++- src/scraping/buxton/narratives.py | 38 ++ .../buxton/narratives/Theme - Chord Kbds.docx | Bin 0 -> 5701815 bytes .../buxton/narratives/chord_keyboards.json | 39 ++ src/scraping/buxton/scraper.py | 399 ++++++++++++++++----- 5 files changed, 411 insertions(+), 111 deletions(-) create mode 100644 src/scraping/buxton/narratives.py create mode 100644 src/scraping/buxton/narratives/Theme - Chord Kbds.docx create mode 100644 src/scraping/buxton/narratives/chord_keyboards.json (limited to 'src/scraping/buxton/scraper.py') diff --git a/src/scraping/buxton/.idea/workspace.xml b/src/scraping/buxton/.idea/workspace.xml index b2c7d4b8c..6f1ae3814 100644 --- a/src/scraping/buxton/.idea/workspace.xml +++ b/src/scraping/buxton/.idea/workspace.xml @@ -126,7 +126,7 @@ @@ -188,30 +210,30 @@ [IDE workspace-state hunks: the XML element markup was lost in extraction, leaving only bare +/- change markers] diff --git a/src/scraping/buxton/narratives.py b/src/scraping/buxton/narratives.py new file mode 100644 index 000000000..947d60f91 --- /dev/null +++ b/src/scraping/buxton/narratives.py @@ -0,0 +1,38 @@ +from docx import Document +import tempfile +from zipfile import ZipFile +import shutil +from pathlib import Path +from os import mkdir + +path = "./narratives/Theme - Chord Kbds.docx" +doc = Document(path) + +# IMAGE_EXT = ('png', 'jpeg', 'jpg') +# +# with tempfile.TemporaryDirectory() as working_dir: +# with ZipFile(path) as working_zip: +# image_list = [name for name in working_zip.namelist() if any(name.endswith(ext) for ext in IMAGE_EXT)] +# working_zip.extractall(working_dir, image_list) +# mkdir("./test") +# for image in image_list: +# shutil.copy(Path(working_dir).resolve() / image, "./test") + +paragraphs = doc.paragraphs +for i in range(len(paragraphs)): + print(f"{i}: {paragraphs[i].text}") + +# for section in doc.sections: +# print(section.orientation) + +# for shape in doc.inline_shapes: +# print(shape._inline) + +# images = doc.tables[0] +# for row in images.rows: +# contents = [] +# for cell in row.cells: +# contents.append(cell.text) + # print(contents) + + diff --git a/src/scraping/buxton/narratives/Theme - Chord Kbds.docx b/src/scraping/buxton/narratives/Theme - Chord Kbds.docx new file mode 100644 index 000000000..439a7d975 Binary files /dev/null and b/src/scraping/buxton/narratives/Theme - Chord Kbds.docx differ diff --git a/src/scraping/buxton/narratives/chord_keyboards.json b/src/scraping/buxton/narratives/chord_keyboards.json new file mode 100644 index 000000000..748578769 --- /dev/null +++ b/src/scraping/buxton/narratives/chord_keyboards.json @@ -0,0 +1,39 @@ +{ + "slides": [{ + "text": "Theme: Chord Keyboards\nFrom music to type\n\nChord keyboards require 2 or more keys to be simultaneously pushed to spawn the intended output.
Playing a chord on a piano or pushing both the shift + a letter key on a typewriter to enter an upper case character are examples.", + "devices": ["Casio CZ-101"] + }, + { + "text": "This is an early mechanical keyboard for taking dictation. Instead of typing alphanumeric characters as on a typewriter, pressing different combinations prints shorthand symbols on the tape, each representing a different phoneme. Speech is easier to keep up with this way, since each phoneme typically represents multiple characters.\n\nThe downside – until AI came to the rescue – was that it then took hours to manually transcribe the shorthand into conventional readable text.", + "devices": ["Grandjean Sténotype"] + }, + { + "text": "Designed and manufactured in the DDR, the purpose of this keyboard is to emboss dots representing Braille symbols onto paper. The effect is to enable blind users to use their tactile sensitivity to read with their fingers.\n\nEach Braille symbol consists of two columns of 3 embossed dots each. Which 3 dots are embossed in each column is determined by which of the three keys on either side are simultaneously pressed. The key in the middle, operated by either thumb, enters a space.", + "devices": ["Braille Writer"] + }, + { + "text": "This combination is derived from the work of the inventor of the mouse, Doug Engelbart.\n\nWhile these are 2 distinct devices, they are not what they appear to be.\n\nFunctionally, there is a virtual 7-button chord keyboard, employing the 5 buttons on the keyset and the middle and right button of the mouse. And, using the left mouse button, there is also a 1-button mouse.\n\nText was entered using a minor variant of 7-bit ASCII. The intent was to enable entering small bits of text without moving back-and-forth between mouse and QWERTY keyboard.
It didn’t catch on.", + "devices": ["Xerox PARC 5-Button Keyset & 3-Button Mouse"] + }, + { + "text": "", + "devices": [] + }, + { + "text": "", + "devices": [] + }, + { + "text": "", + "devices": [] + }, + { + "text": "", + "devices": [] + }, + { + "text": "", + "devices": [] + } + ] +} \ No newline at end of file diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py index 394958823..f7a38112d 100644 --- a/src/scraping/buxton/scraper.py +++ b/src/scraping/buxton/scraper.py @@ -1,32 +1,36 @@ import os +from shutil import copyfile import docx2txt from docx import Document from docx.opc.constants import RELATIONSHIP_TYPE as RT import re +from pymongo import MongoClient import shutil import uuid -import json -import base64 -from shutil import copyfile +import datetime from PIL import Image +import math + +source = "./source" +filesPath = "../../server/public/files" +image_dist = filesPath + "/images/buxton" -files_path = "../../server/public/files" -source_path = "./source" -temp_images_path = "./extracted_images" -server_images_path = f"{files_path}/images/buxton" -json_path = "./json" +db = MongoClient("localhost", 27017)["Dash"] +target_collection = db.newDocuments +target_doc_title = "Collection 1" +schema_guids = [] +common_proto_id = "" -# noinspection PyProtectedMember -def extract_links(file): +def extract_links(fileName): links = [] - doc = Document(file) + doc = Document(fileName) rels = doc.part.rels for rel in rels: item = rels[rel] if item.reltype == RT.HYPERLINK and ".aspx" not in item._target: links.append(item._target) - return links + return text_doc_map(links) def extract_value(kv_string): @@ -46,58 +50,228 @@ def guid(): return str(uuid.uuid4()) -def encode_image(folder: str, name: str): - with open(f"{temp_images_path}/{folder}/{name}", "rb") as image: - encoded = base64.b64encode(image.read()) - return encoded.decode("utf-8") - - -def parse_document(name: str): - print(f"parsing {name}...") - pure_name = name.split(".")[0] +def listify(list): + return { + "fields": list, + "__type": "list" + } + + +def protofy(fieldId): + return { + "fieldId": fieldId, + "__type": "proxy" + } + + +def text_doc_map(string_list): + def guid_map(caption): + return write_text_doc(caption) + return listify(proxify_guids(list(map(guid_map, string_list)))) + + +def write_collection(parse_results, display_fields, storage_key, viewType): + view_guids = parse_results["child_guids"] + + data_doc = parse_results["schema"] + fields = data_doc["fields"] + + view_doc_guid = guid() + + view_doc = { + "_id": view_doc_guid, + "fields": { + "proto": protofy(data_doc["_id"]), + "x": 10, + "y": 10, + "_width": 900, + "_height": 600, + "_panX": 0, + "_panY": 0, + "zIndex": 2, + "libraryBrush": False, + "_viewType": viewType, + "_LODdisable": True + }, + "__type": "Doc" + } + + fields["proto"] = protofy(common_proto_id) + fields[storage_key] = listify(proxify_guids(view_guids)) + fields["schemaColumns"] = listify(display_fields) + fields["author"] = "Bill Buxton" + fields["creationDate"] = { + "date": datetime.datetime.utcnow().microsecond, + "__type": "date" + } + if "image_urls" in parse_results: + fields["hero"] = { + "url": parse_results["image_urls"][0], + "__type": "image" + } + fields["isPrototype"] = True + + target_collection.insert_one(data_doc) + target_collection.insert_one(view_doc) + + data_doc_guid = data_doc["_id"] + print(f"inserted view document ({view_doc_guid})") + print(f"inserted data document ({data_doc_guid})\n") + + return view_doc_guid + + +def 
write_text_doc(content): + data_doc_guid = guid() + view_doc_guid = guid() + + view_doc = { + "_id": view_doc_guid, + "fields": { + "proto": protofy(data_doc_guid), + "x": 10, + "y": 10, + "_width": 400, + "zIndex": 2 + }, + "__type": "Doc" + } + + data_doc = { + "_id": data_doc_guid, + "fields": { + "proto": protofy("textProto"), + "data": { + "Data": '{"doc":{"type":"doc","content":[{"type":"paragraph","content":[{"type":"text","text":"' + content + '"}]}]},"selection":{"type":"text","anchor":1,"head":1}' + '}', + "__type": "RichTextField" + }, + "title": content, + "_nativeWidth": 200, + "author": "Bill Buxton", + "creationDate": { + "date": datetime.datetime.utcnow().microsecond, + "__type": "date" + }, + "isPrototype": True, + "_autoHeight": True, + "page": -1, + "_nativeHeight": 200, + "_height": 200, + "data_text": content + }, + "__type": "Doc" + } + + target_collection.insert_one(view_doc) + target_collection.insert_one(data_doc) + + return view_doc_guid + + +def write_image(folder, name): + path = f"http://localhost:1050/files/images/buxton/{folder}/{name}" + + data_doc_guid = guid() + view_doc_guid = guid() + + image = Image.open(f"{image_dist}/{folder}/{name}") + native_width, native_height = image.size + + if abs(native_width - native_height) < 10: + return None + + view_doc = { + "_id": view_doc_guid, + "fields": { + "proto": protofy(data_doc_guid), + "x": 10, + "y": 10, + "_width": min(800, native_width), + "zIndex": 2, + "widthUnit": "*", + "widthMagnitude": 1 + }, + "__type": "Doc" + } + + data_doc = { + "_id": data_doc_guid, + "fields": { + "proto": protofy("imageProto"), + "data": { + "url": path, + "__type": "image" + }, + "title": name, + "_nativeWidth": native_width, + "author": "Bill Buxton", + "creationDate": { + "date": datetime.datetime.utcnow().microsecond, + "__type": "date" + }, + "isPrototype": True, + "page": -1, + "_nativeHeight": native_height, + "_height": native_height + }, + "__type": "Doc" + } + + target_collection.insert_one(view_doc) + target_collection.insert_one(data_doc) + + return { + "layout_id": view_doc_guid, + "url": path + } + + +def parse_document(file_name: str): + print(f"parsing {file_name}...") + pure_name = file_name.split(".")[0] result = {} - saved_device_images_dir = server_images_path + "/" + pure_name - temp_device_images_dir = temp_images_path + "/" + pure_name - mkdir_if_absent(temp_device_images_dir) - mkdir_if_absent(saved_device_images_dir) - - raw = str(docx2txt.process(source_path + - "/" + name, temp_device_images_dir)) - - extracted_images = [] - for image in os.listdir(temp_device_images_dir): - temp = f"{temp_device_images_dir}/{image}" - native_width, native_height = Image.open(temp).size - if abs(native_width - native_height) < 10: - continue - original = saved_device_images_dir + "/" + image.replace(".", "_o.", 1) - medium = saved_device_images_dir + "/" + image.replace(".", "_m.", 1) - copyfile(temp, original) - copyfile(temp, medium) - server_path = f"http://localhost:1050/files/images/buxton/{pure_name}/{image}" - extracted_images.append(server_path) - result["extracted_images"] = extracted_images + dir_path = image_dist + "/" + pure_name + print(dir_path) + mkdir_if_absent(dir_path) + + raw = str(docx2txt.process(source + "/" + file_name, dir_path)) + + urls = [] + view_guids = [] + count = 0 + for image in os.listdir(dir_path): + created = write_image(pure_name, image) + if created != None: + urls.append(created["url"]) + view_guids.append(created["layout_id"]) + count += 1 + resolved = dir_path + "/" + image + 
original = dir_path + "/" + image.replace(".", "_o.", 1) + medium = dir_path + "/" + image.replace(".", "_m.", 1) + copyfile(resolved, original) + copyfile(resolved, medium) + print(f"extracted {count} images...") def sanitize(line): return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace( u"\u2013", "-").replace(u"\u201c", '''"''').replace(u"\u201d", '''"''').strip() - def sanitize_price(raw_price: str): - raw_price = raw_price.replace(",", "") - start = raw_price.find("$") - if "x" in raw_price.lower(): + def sanitize_price(raw: str): + raw = raw.replace(",", "") + if "x" in raw.lower(): return None + start = raw.find("$") if start > -1: i = start + 1 - while i < len(raw_price) and re.match(r"[0-9.]", raw_price[i]): + while (i < len(raw) and re.match(r"[0-9\.]", raw[i])): i += 1 - price = raw_price[start + 1: i + 1] + price = raw[start + 1: i + 1] return float(price) - elif raw_price.lower().find("nfs"): + elif (raw.lower().find("nfs")): return -1 else: - return None + return math.nan def remove_empty(line): return len(line) > 1 @@ -112,6 +286,7 @@ def parse_document(name: str): lines = list(map(sanitize, raw.split("\n"))) lines = list(filter(remove_empty, lines)) + result["file_name"] = file_name result["title"] = lines[2].strip() result["short_description"] = lines[3].strip().replace( "Short Description: ", "") @@ -127,13 +302,11 @@ def parse_document(name: str): clean = list( map(lambda data: data.strip().split(":"), lines[cur].split("|"))) result["company"] = clean[0][len(clean[0]) - 1].strip() - result["year"] = try_parse(clean[1][len(clean[1]) - 1].strip()) result["original_price"] = sanitize_price( clean[2][len(clean[2]) - 1].strip()) cur += 1 - result["degrees_of_freedom"] = try_parse(extract_value( lines[cur]).replace("NA", "N/A")) cur += 1 @@ -161,71 +334,99 @@ def parse_document(name: str): cur += 1 link_descriptions = [] while lines[cur] != "Image": - description = lines[cur].strip().lower() - valid = True - for ignored in ["powerpoint", "vimeo", "xxx"]: - if ignored in description: - valid = False - break - if valid: - link_descriptions.append(description) + link_descriptions.append(lines[cur].strip()) cur += 1 - result["link_descriptions"] = link_descriptions + result["link_descriptions"] = text_doc_map(link_descriptions) - result["hyperlinks"] = extract_links(source_path + "/" + name) + result["hyperlinks"] = extract_links(source + "/" + file_name) images = [] captions = [] cur += 3 while cur + 1 < len(lines) and lines[cur] != "NOTES:": - name = lines[cur] - if "full document" not in name.lower(): - images.append(name) - captions.append(lines[cur + 1]) + images.append(lines[cur]) + captions.append(lines[cur + 1]) cur += 2 - result["table_image_names"] = images + result["images"] = listify(images) - result["captions"] = captions + result["captions"] = text_doc_map(captions) notes = [] - if cur < len(lines) and lines[cur] == "NOTES:": + if (cur < len(lines) and lines[cur] == "NOTES:"): cur += 1 while cur < len(lines): notes.append(lines[cur]) cur += 1 if len(notes) > 0: - result["notes"] = notes - - return result - - -if os.path.exists(server_images_path): - shutil.rmtree(server_images_path) -while os.path.exists(server_images_path): + result["notes"] = listify(notes) + + print("writing child schema...") + + return { + "schema": { + "_id": guid(), + "fields": result, + "__type": "Doc" + }, + "child_guids": view_guids, + "image_urls": urls + } + + +def proxify_guids(guids): + return list(map(lambda guid: {"fieldId": guid, "__type": "prefetch_proxy"}, guids)) + + 
+def write_common_proto(): + id = guid() + common_proto = { + "_id": id, + "fields": { + "proto": protofy("collectionProto"), + "title": "The Buxton Collection", + }, + "__type": "Doc" + } + target_collection.insert_one(common_proto) + return id + + +if os.path.exists(image_dist): + shutil.rmtree(image_dist) +while os.path.exists(image_dist): pass -os.mkdir(server_images_path) +os.mkdir(image_dist) +mkdir_if_absent(source) -mkdir_if_absent(source_path) -mkdir_if_absent(json_path) -mkdir_if_absent(temp_images_path) - -results = [] +common_proto_id = write_common_proto() candidates = 0 -for file_name in os.listdir(source_path): - if file_name.endswith('.docx') or file_name.endswith(".doc"): +for file_name in os.listdir(source): + if file_name.endswith('.docx') or file_name.endswith('.doc'): candidates += 1 - results.append(parse_document(file_name)) - - -with open(f"./json/buxton_collection.json", "w", encoding="utf-8") as out: - json.dump(results, out, ensure_ascii=False, indent=4) - -print(f"\nSuccessfully parsed {candidates} candidates.") - -print("\nrewriting .gitignore...") -entries = ['*', '!.gitignore'] -with open(files_path + "/.gitignore", 'w') as f: - f.write('\n'.join(entries)) - -shutil.rmtree(temp_images_path) + schema_guids.append(write_collection( + parse_document(file_name), ["title", "data"], "data", 5)) + +print("writing parent schema...") +parent_guid = write_collection({ + "schema": { + "_id": guid(), + "fields": {}, + "__type": "Doc" + }, + "child_guids": schema_guids +}, ["title", "short_description", "original_price"], "data", 2) + +print("appending parent schema to main workspace...\n") +target_collection.update_one( + {"fields.title": target_doc_title}, + {"$push": {"fields.data.fields": {"fieldId": parent_guid, "__type": "proxy"}}} +) + +print("rewriting .gitignore...\n") +lines = ['*', '!.gitignore'] +with open(filesPath + "/.gitignore", 'w') as f: + f.write('\n'.join(lines)) + +suffix = "" if candidates == 1 else "s" +print(f"conversion complete. {candidates} candidate{suffix} processed.") -- cgit v1.2.3-70-g09d2 From 646de60fc314198b97172c62f414ffb9576ffb98 Mon Sep 17 00:00:00 2001 From: bob Date: Mon, 3 Feb 2020 11:33:24 -0500 Subject: fixed timeline bug. made multirow/multicol use same fields. 
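Both layout axes now read one shared pair of fields, dimUnit ("px" or "*") and dimMagnitude, in place of the separate widthUnit/widthMagnitude and heightUnit/heightMagnitude pairs, and scraper.py emits the same fields on the image documents it writes. A minimal sketch of the two sizing modes (illustrative values only; the full document schema is whatever write_image constructs):

    # hypothetical sizing fields on a child document after this change
    ratio_sized = {"dimUnit": "*", "dimMagnitude": 1}     # one star-share of the free space
    pixel_sized = {"dimUnit": "px", "dimMagnitude": 250}  # fixed 250px column or row

Ratio-sized documents are normalized so that the smallest star magnitude in a collection becomes 1, while pixel-sized documents keep their fixed allocation and leave the adjacent resizers inactive.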
--- .../CollectionFreeFormLayoutEngines.tsx | 28 +++++++++++-------- .../collectionFreeForm/CollectionFreeFormView.tsx | 4 +-- .../CollectionMulticolumnView.tsx | 32 +++++++++++----------- .../CollectionMultirowView.tsx | 32 +++++++++++----------- .../collectionMulticolumn/MulticolumnResizer.tsx | 20 +++++++------- .../MulticolumnWidthLabel.tsx | 12 ++++---- .../collectionMulticolumn/MultirowHeightLabel.tsx | 12 ++++---- .../collectionMulticolumn/MultirowResizer.tsx | 20 +++++++------- src/scraping/buxton/scraper.py | 4 +-- 9 files changed, 84 insertions(+), 80 deletions(-) (limited to 'src/scraping/buxton/scraper.py') diff --git a/src/client/views/collections/collectionFreeForm/CollectionFreeFormLayoutEngines.tsx b/src/client/views/collections/collectionFreeForm/CollectionFreeFormLayoutEngines.tsx index f08c2506e..da0b51196 100644 --- a/src/client/views/collections/collectionFreeForm/CollectionFreeFormLayoutEngines.tsx +++ b/src/client/views/collections/collectionFreeForm/CollectionFreeFormLayoutEngines.tsx @@ -204,7 +204,21 @@ export function computeTimelineLayout( x += scaling * (key - prevKey); const stack = findStack(x, stacking); prevKey = key; - !stack && Math.abs(x - (curTime - minTime) * scaling) > pivotAxisWidth && groupNames.push({ type: "text", text: key.toString(), x: x, y: stack * 25, height: fontHeight, fontSize }); + !stack && (curTime === undefined || Math.abs(x - (curTime - minTime) * scaling) > pivotAxisWidth) && groupNames.push({ type: "text", text: key.toString(), x: x, y: stack * 25, height: fontHeight, fontSize }); + newFunction(keyDocs, key); + }); + if (sortedKeys.length && curTime > sortedKeys[sortedKeys.length - 1]) { + x = (curTime - minTime) * scaling; + groupNames.push({ type: "text", text: curTime.toString(), x: x, y: 0, zIndex: 1000, color: "orange", height: fontHeight, fontSize }); + } + if (Math.ceil(maxTime - minTime) * scaling > x + 25) { + groupNames.push({ type: "text", text: Math.ceil(maxTime).toString(), x: Math.ceil(maxTime - minTime) * scaling, y: 0, height: fontHeight, fontSize }); + } + + const divider = { type: "div", color: "black", x: 0, y: 0, width: panelDim[0], height: 1 } as any; + return normalizeResults(panelDim, fontHeight, childPairs, docMap, poolData, viewDefsToJSX, groupNames, (maxTime - minTime) * scaling, [divider]); + + function newFunction(keyDocs: Doc[], key: number) { keyDocs.forEach(doc => { const stack = findStack(x, stacking); const layoutDoc = Doc.Layout(doc); @@ -215,22 +229,12 @@ export function computeTimelineLayout( wid = layoutDoc._nativeHeight ? (NumCast(layoutDoc._nativeWidth) / NumCast(layoutDoc._nativeHeight)) * pivotAxisWidth : pivotAxisWidth; } docMap.set(doc, { - x: x, y: - Math.sqrt(stack) * pivotAxisWidth / 2 - pivotAxisWidth + (pivotAxisWidth - hgt) / 2, + x: x, y: -Math.sqrt(stack) * pivotAxisWidth / 2 - pivotAxisWidth + (pivotAxisWidth - hgt) / 2, zIndex: (curTime === key ? 
1000 : zind++), highlight: curTime === key, width: wid / (Math.max(stack, 1)), height: hgt }); stacking[stack] = x + pivotAxisWidth; }); - }); - if (sortedKeys.length && curTime > sortedKeys[sortedKeys.length - 1]) { - x = (curTime - minTime) * scaling; - groupNames.push({ type: "text", text: curTime.toString(), x: x, y: 0, zIndex: 1000, color: "orange", height: fontHeight, fontSize }); } - if (Math.ceil(maxTime - minTime) * scaling > x + 25) { - groupNames.push({ type: "text", text: Math.ceil(maxTime).toString(), x: Math.ceil(maxTime - minTime) * scaling, y: 0, height: fontHeight, fontSize }); - } - - const divider = { type: "div", color: "black", x: 0, y: 0, width: panelDim[0], height: 1 } as any; - return normalizeResults(panelDim, fontHeight, childPairs, docMap, poolData, viewDefsToJSX, groupNames, (maxTime - minTime) * scaling, [divider]); } function normalizeResults(panelDim: number[], fontHeight: number, childPairs: { data?: Doc, layout: Doc }[], docMap: Map, diff --git a/src/client/views/collections/collectionFreeForm/CollectionFreeFormView.tsx b/src/client/views/collections/collectionFreeForm/CollectionFreeFormView.tsx index f1a239050..30ddd09e6 100644 --- a/src/client/views/collections/collectionFreeForm/CollectionFreeFormView.tsx +++ b/src/client/views/collections/collectionFreeForm/CollectionFreeFormView.tsx @@ -791,12 +791,12 @@ export class CollectionFreeFormView extends CollectionSubView(PanZoomDocument) { doTimelineLayout(poolData: ObservableMap) { return computeTimelineLayout(poolData, this.props.Document, this.childDocs, - this.childLayoutPairs.filter(pair => this.isCurrent(pair.layout)), [this.props.PanelWidth(), this.props.PanelHeight()], this.viewDefsToJSX); + this.childLayoutPairs, [this.props.PanelWidth(), this.props.PanelHeight()], this.viewDefsToJSX); } doPivotLayout(poolData: ObservableMap) { return computePivotLayout(poolData, this.props.Document, this.childDocs, - this.childLayoutPairs.filter(pair => this.isCurrent(pair.layout)), [this.props.PanelWidth(), this.props.PanelHeight()], this.viewDefsToJSX); + this.childLayoutPairs, [this.props.PanelWidth(), this.props.PanelHeight()], this.viewDefsToJSX); } doFreeformLayout(poolData: ObservableMap) { diff --git a/src/client/views/collections/collectionMulticolumn/CollectionMulticolumnView.tsx b/src/client/views/collections/collectionMulticolumn/CollectionMulticolumnView.tsx index 041eb69da..65862f34f 100644 --- a/src/client/views/collections/collectionMulticolumn/CollectionMulticolumnView.tsx +++ b/src/client/views/collections/collectionMulticolumn/CollectionMulticolumnView.tsx @@ -28,12 +28,12 @@ interface LayoutData { starSum: number; } -export const WidthUnit = { +export const DimUnit = { Pixel: "px", Ratio: "*" }; -const resolvedUnits = Object.values(WidthUnit); +const resolvedUnits = Object.values(DimUnit); const resizerWidth = 4; @observer @@ -45,12 +45,12 @@ export class CollectionMulticolumnView extends CollectionSubView(MulticolumnDocu */ @computed private get ratioDefinedDocs() { - return this.childLayoutPairs.map(({ layout }) => layout).filter(({ widthUnit }) => StrCast(widthUnit) === WidthUnit.Ratio); + return this.childLayoutPairs.map(({ layout }) => layout).filter(({ dimUnit }) => StrCast(dimUnit) === DimUnit.Ratio); } /** - * This loops through all childLayoutPairs and extracts the values for widthUnit - * and widthMagnitude, ignoring any that are malformed. 
Additionally, it then + * This loops through all childLayoutPairs and extracts the values for dimUnit + * and dimMagnitude, ignoring any that are malformed. Additionally, it then * normalizes the ratio values so that one * value is always 1, with the remaining * values proportionate to that easily readable metric. * @returns the list of the resolved width specifiers (unit and magnitude pairs) @@ -60,11 +60,11 @@ export class CollectionMulticolumnView extends CollectionSubView(MulticolumnDocu private get resolvedLayoutInformation(): LayoutData { let starSum = 0; const widthSpecifiers: WidthSpecifier[] = []; - this.childLayoutPairs.map(({ layout: { widthUnit, widthMagnitude } }) => { - const unit = StrCast(widthUnit); - const magnitude = NumCast(widthMagnitude); + this.childLayoutPairs.map(({ layout: { dimUnit, dimMagnitude } }) => { + const unit = StrCast(dimUnit); + const magnitude = NumCast(dimMagnitude); if (unit && magnitude && magnitude > 0 && resolvedUnits.includes(unit)) { - (unit === WidthUnit.Ratio) && (starSum += magnitude); + (unit === DimUnit.Ratio) && (starSum += magnitude); widthSpecifiers.push({ magnitude, unit }); } /** @@ -82,9 +82,9 @@ export class CollectionMulticolumnView extends CollectionSubView(MulticolumnDocu setTimeout(() => { const { ratioDefinedDocs } = this; if (this.childLayoutPairs.length) { - const minimum = Math.min(...ratioDefinedDocs.map(({ widthMagnitude }) => NumCast(widthMagnitude))); + const minimum = Math.min(...ratioDefinedDocs.map(({ dimMagnitude }) => NumCast(dimMagnitude))); if (minimum !== 0) { - ratioDefinedDocs.forEach(layout => layout.widthMagnitude = NumCast(layout.widthMagnitude) / minimum); + ratioDefinedDocs.forEach(layout => layout.dimMagnitude = NumCast(layout.dimMagnitude) / minimum); } } }); @@ -103,7 +103,7 @@ export class CollectionMulticolumnView extends CollectionSubView(MulticolumnDocu @computed private get totalFixedAllocation(): number | undefined { return this.resolvedLayoutInformation?.widthSpecifiers.reduce( - (sum, { magnitude, unit }) => sum + (unit === WidthUnit.Pixel ? magnitude : 0), 0); + (sum, { magnitude, unit }) => sum + (unit === DimUnit.Pixel ? 
magnitude : 0), 0); } /** @@ -160,8 +160,8 @@ export class CollectionMulticolumnView extends CollectionSubView(MulticolumnDocu if (columnUnitLength === undefined) { return 0; // we're still waiting on promises to resolve } - let width = NumCast(layout.widthMagnitude); - if (StrCast(layout.widthUnit) === WidthUnit.Ratio) { + let width = NumCast(layout.dimMagnitude); + if (StrCast(layout.dimUnit) === DimUnit.Ratio) { width *= columnUnitLength; } return width; @@ -193,8 +193,8 @@ export class CollectionMulticolumnView extends CollectionSubView(MulticolumnDocu drop = (e: Event, de: DragManager.DropEvent) => { if (super.drop(e, de)) { de.complete.docDragData?.droppedDocuments.forEach(action((d: Doc) => { - d.widthUnit = "*"; - d.widthMagnitude = 1; + d.dimUnit = "*"; + d.dimMagnitude = 1; })); } return false; diff --git a/src/client/views/collections/collectionMulticolumn/CollectionMultirowView.tsx b/src/client/views/collections/collectionMulticolumn/CollectionMultirowView.tsx index e07985bb4..aa440b677 100644 --- a/src/client/views/collections/collectionMulticolumn/CollectionMultirowView.tsx +++ b/src/client/views/collections/collectionMulticolumn/CollectionMultirowView.tsx @@ -28,12 +28,12 @@ interface LayoutData { starSum: number; } -export const HeightUnit = { +export const DimUnit = { Pixel: "px", Ratio: "*" }; -const resolvedUnits = Object.values(HeightUnit); +const resolvedUnits = Object.values(DimUnit); const resizerHeight = 4; @observer @@ -45,12 +45,12 @@ export class CollectionMultirowView extends CollectionSubView(MultirowDocument) */ @computed private get ratioDefinedDocs() { - return this.childLayoutPairs.map(({ layout }) => layout).filter(({ widthUnit }) => StrCast(widthUnit) === HeightUnit.Ratio); + return this.childLayoutPairs.map(({ layout }) => layout).filter(({ dimUnit }) => StrCast(dimUnit) === DimUnit.Ratio); } /** - * This loops through all childLayoutPairs and extracts the values for widthUnit - * and widthMagnitude, ignoring any that are malformed. Additionally, it then + * This loops through all childLayoutPairs and extracts the values for dimUnit + * and dimUnit, ignoring any that are malformed. Additionally, it then * normalizes the ratio values so that one * value is always 1, with the remaining * values proportionate to that easily readable metric. 
* @returns the list of the resolved width specifiers (unit and magnitude pairs) @@ -60,11 +60,11 @@ export class CollectionMultirowView extends CollectionSubView(MultirowDocument) private get resolvedLayoutInformation(): LayoutData { let starSum = 0; const heightSpecifiers: HeightSpecifier[] = []; - this.childLayoutPairs.map(({ layout: { heightUnit, heightMagnitude } }) => { - const unit = StrCast(heightUnit); - const magnitude = NumCast(heightMagnitude); + this.childLayoutPairs.map(({ layout: { dimUnit, dimMagnitude } }) => { + const unit = StrCast(dimUnit); + const magnitude = NumCast(dimMagnitude); if (unit && magnitude && magnitude > 0 && resolvedUnits.includes(unit)) { - (unit === HeightUnit.Ratio) && (starSum += magnitude); + (unit === DimUnit.Ratio) && (starSum += magnitude); heightSpecifiers.push({ magnitude, unit }); } /** @@ -82,9 +82,9 @@ export class CollectionMultirowView extends CollectionSubView(MultirowDocument) setTimeout(() => { const { ratioDefinedDocs } = this; if (this.childLayoutPairs.length) { - const minimum = Math.min(...ratioDefinedDocs.map(({ heightMagnitude }) => NumCast(heightMagnitude))); + const minimum = Math.min(...ratioDefinedDocs.map(({ dimMagnitude }) => NumCast(dimMagnitude))); if (minimum !== 0) { - ratioDefinedDocs.forEach(layout => layout.heightMagnitude = NumCast(layout.heightMagnitude) / minimum); + ratioDefinedDocs.forEach(layout => layout.dimMagnitude = NumCast(layout.dimMagnitude) / minimum); } } }); @@ -103,7 +103,7 @@ export class CollectionMultirowView extends CollectionSubView(MultirowDocument) @computed private get totalFixedAllocation(): number | undefined { return this.resolvedLayoutInformation?.heightSpecifiers.reduce( - (sum, { magnitude, unit }) => sum + (unit === HeightUnit.Pixel ? magnitude : 0), 0); + (sum, { magnitude, unit }) => sum + (unit === DimUnit.Pixel ? 
magnitude : 0), 0); } /** @@ -160,8 +160,8 @@ export class CollectionMultirowView extends CollectionSubView(MultirowDocument) if (rowUnitLength === undefined) { return 0; // we're still waiting on promises to resolve } - let height = NumCast(layout.heightMagnitude); - if (StrCast(layout.heightUnit) === HeightUnit.Ratio) { + let height = NumCast(layout.dimMagnitude); + if (StrCast(layout.dimUnit) === DimUnit.Ratio) { height *= rowUnitLength; } return height; @@ -193,8 +193,8 @@ export class CollectionMultirowView extends CollectionSubView(MultirowDocument) drop = (e: Event, de: DragManager.DropEvent) => { if (super.drop(e, de)) { de.complete.docDragData?.droppedDocuments.forEach(action((d: Doc) => { - d.heightUnit = "*"; - d.heightMagnitude = 1; + d.dimUnit = "*"; + d.dimMagnitude = 1; })); } return false; diff --git a/src/client/views/collections/collectionMulticolumn/MulticolumnResizer.tsx b/src/client/views/collections/collectionMulticolumn/MulticolumnResizer.tsx index 11e210958..46c39d817 100644 --- a/src/client/views/collections/collectionMulticolumn/MulticolumnResizer.tsx +++ b/src/client/views/collections/collectionMulticolumn/MulticolumnResizer.tsx @@ -3,7 +3,7 @@ import { observer } from "mobx-react"; import { observable, action } from "mobx"; import { Doc } from "../../../../new_fields/Doc"; import { NumCast, StrCast } from "../../../../new_fields/Types"; -import { WidthUnit } from "./CollectionMulticolumnView"; +import { DimUnit } from "./CollectionMulticolumnView"; interface ResizerProps { width: number; @@ -46,14 +46,14 @@ export default class ResizeBar extends React.Component { const unitLength = columnUnitLength(); if (unitLength) { if (toNarrow) { - const { widthUnit, widthMagnitude } = toNarrow; - const scale = widthUnit === WidthUnit.Ratio ? unitLength : 1; - toNarrow.widthMagnitude = NumCast(widthMagnitude) - Math.abs(movementX) / scale; + const { dimUnit, dimMagnitude } = toNarrow; + const scale = dimUnit === DimUnit.Ratio ? unitLength : 1; + toNarrow.dimMagnitude = NumCast(dimMagnitude) - Math.abs(movementX) / scale; } if (this.resizeMode === ResizeMode.Pinned && toWiden) { - const { widthUnit, widthMagnitude } = toWiden; - const scale = widthUnit === WidthUnit.Ratio ? unitLength : 1; - toWiden.widthMagnitude = NumCast(widthMagnitude) + Math.abs(movementX) / scale; + const { dimUnit, dimMagnitude } = toWiden; + const scale = dimUnit === DimUnit.Ratio ? 
unitLength : 1; + toWiden.dimMagnitude = NumCast(dimMagnitude) + Math.abs(movementX) / scale; } } } @@ -61,17 +61,17 @@ export default class ResizeBar extends React.Component { private get isActivated() { const { toLeft, toRight } = this.props; if (toLeft && toRight) { - if (StrCast(toLeft.widthUnit) === WidthUnit.Pixel && StrCast(toRight.widthUnit) === WidthUnit.Pixel) { + if (StrCast(toLeft.dimUnit) === DimUnit.Pixel && StrCast(toRight.dimUnit) === DimUnit.Pixel) { return false; } return true; } else if (toLeft) { - if (StrCast(toLeft.widthUnit) === WidthUnit.Pixel) { + if (StrCast(toLeft.dimUnit) === DimUnit.Pixel) { return false; } return true; } else if (toRight) { - if (StrCast(toRight.widthUnit) === WidthUnit.Pixel) { + if (StrCast(toRight.dimUnit) === DimUnit.Pixel) { return false; } return true; diff --git a/src/client/views/collections/collectionMulticolumn/MulticolumnWidthLabel.tsx b/src/client/views/collections/collectionMulticolumn/MulticolumnWidthLabel.tsx index b394fed62..5b2054428 100644 --- a/src/client/views/collections/collectionMulticolumn/MulticolumnWidthLabel.tsx +++ b/src/client/views/collections/collectionMulticolumn/MulticolumnWidthLabel.tsx @@ -4,7 +4,7 @@ import { computed } from "mobx"; import { Doc } from "../../../../new_fields/Doc"; import { NumCast, StrCast, BoolCast } from "../../../../new_fields/Types"; import { EditableView } from "../../EditableView"; -import { WidthUnit } from "./CollectionMulticolumnView"; +import { DimUnit } from "./CollectionMulticolumnView"; interface WidthLabelProps { layout: Doc; @@ -18,8 +18,8 @@ export default class WidthLabel extends React.Component { @computed private get contents() { const { layout, decimals } = this.props; - const getUnit = () => StrCast(layout.widthUnit); - const getMagnitude = () => String(+NumCast(layout.widthMagnitude).toFixed(decimals ?? 3)); + const getUnit = () => StrCast(layout.dimUnit); + const getMagnitude = () => String(+NumCast(layout.dimMagnitude).toFixed(decimals ?? 3)); return (
{ SetValue={value => { const converted = Number(value); if (!isNaN(converted) && converted > 0) { - layout.widthMagnitude = converted; + layout.dimMagnitude = converted; return true; } return false; @@ -37,8 +37,8 @@ export default class WidthLabel extends React.Component { { - if (Object.values(WidthUnit).includes(value)) { - layout.widthUnit = value; + if (Object.values(DimUnit).includes(value)) { + layout.dimUnit = value; return true; } return false; diff --git a/src/client/views/collections/collectionMulticolumn/MultirowHeightLabel.tsx b/src/client/views/collections/collectionMulticolumn/MultirowHeightLabel.tsx index 56a2e868d..899577fd5 100644 --- a/src/client/views/collections/collectionMulticolumn/MultirowHeightLabel.tsx +++ b/src/client/views/collections/collectionMulticolumn/MultirowHeightLabel.tsx @@ -4,7 +4,7 @@ import { computed } from "mobx"; import { Doc } from "../../../../new_fields/Doc"; import { NumCast, StrCast, BoolCast } from "../../../../new_fields/Types"; import { EditableView } from "../../EditableView"; -import { HeightUnit } from "./CollectionMultirowView"; +import { DimUnit } from "./CollectionMultirowView"; interface HeightLabelProps { layout: Doc; @@ -18,8 +18,8 @@ export default class HeightLabel extends React.Component { @computed private get contents() { const { layout, decimals } = this.props; - const getUnit = () => StrCast(layout.heightUnit); - const getMagnitude = () => String(+NumCast(layout.heightMagnitude).toFixed(decimals ?? 3)); + const getUnit = () => StrCast(layout.dimUnit); + const getMagnitude = () => String(+NumCast(layout.dimMagnitude).toFixed(decimals ?? 3)); return (
{ SetValue={value => { const converted = Number(value); if (!isNaN(converted) && converted > 0) { - layout.heightMagnitude = converted; + layout.dimMagnitude = converted; return true; } return false; @@ -37,8 +37,8 @@ export default class HeightLabel extends React.Component { { - if (Object.values(HeightUnit).includes(value)) { - layout.heightUnit = value; + if (Object.values(DimUnit).includes(value)) { + layout.dimUnit = value; return true; } return false; diff --git a/src/client/views/collections/collectionMulticolumn/MultirowResizer.tsx b/src/client/views/collections/collectionMulticolumn/MultirowResizer.tsx index 20c6cd3df..4f58f3fa8 100644 --- a/src/client/views/collections/collectionMulticolumn/MultirowResizer.tsx +++ b/src/client/views/collections/collectionMulticolumn/MultirowResizer.tsx @@ -3,7 +3,7 @@ import { observer } from "mobx-react"; import { observable, action } from "mobx"; import { Doc } from "../../../../new_fields/Doc"; import { NumCast, StrCast } from "../../../../new_fields/Types"; -import { HeightUnit } from "./CollectionMultirowView"; +import { DimUnit } from "./CollectionMultirowView"; interface ResizerProps { height: number; @@ -46,14 +46,14 @@ export default class ResizeBar extends React.Component { const unitLength = columnUnitLength(); if (unitLength) { if (toNarrow) { - const { heightUnit, heightMagnitude } = toNarrow; - const scale = heightUnit === HeightUnit.Ratio ? unitLength : 1; - toNarrow.heightMagnitude = NumCast(heightMagnitude) - Math.abs(movementY) / scale; + const { dimUnit, dimMagnitude } = toNarrow; + const scale = dimUnit === DimUnit.Ratio ? unitLength : 1; + toNarrow.dimMagnitude = NumCast(dimMagnitude) - Math.abs(movementY) / scale; } if (this.resizeMode === ResizeMode.Pinned && toWiden) { - const { heightUnit, heightMagnitude } = toWiden; - const scale = heightUnit === HeightUnit.Ratio ? unitLength : 1; - toWiden.heightMagnitude = NumCast(heightMagnitude) + Math.abs(movementY) / scale; + const { dimUnit, dimMagnitude } = toWiden; + const scale = dimUnit === DimUnit.Ratio ? 
unitLength : 1; + toWiden.dimMagnitude = NumCast(dimMagnitude) + Math.abs(movementY) / scale; } } } @@ -61,17 +61,17 @@ export default class ResizeBar extends React.Component { private get isActivated() { const { toTop, toBottom } = this.props; if (toTop && toBottom) { - if (StrCast(toTop.heightUnit) === HeightUnit.Pixel && StrCast(toBottom.heightUnit) === HeightUnit.Pixel) { + if (StrCast(toTop.dimUnit) === DimUnit.Pixel && StrCast(toBottom.dimUnit) === DimUnit.Pixel) { return false; } return true; } else if (toTop) { - if (StrCast(toTop.heightUnit) === HeightUnit.Pixel) { + if (StrCast(toTop.dimUnit) === DimUnit.Pixel) { return false; } return true; } else if (toBottom) { - if (StrCast(toBottom.heightUnit) === HeightUnit.Pixel) { + if (StrCast(toBottom.dimUnit) === DimUnit.Pixel) { return false; } return true; diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py index f7a38112d..3375c1141 100644 --- a/src/scraping/buxton/scraper.py +++ b/src/scraping/buxton/scraper.py @@ -188,8 +188,8 @@ def write_image(folder, name): "y": 10, "_width": min(800, native_width), "zIndex": 2, - "widthUnit": "*", - "widthMagnitude": 1 + "dimUnit": "*", + "dimMagnitude": 1 }, "__type": "Doc" } -- cgit v1.2.3-70-g09d2 From 983f51b62f4b869bdb86fc4b708098d02f0d749d Mon Sep 17 00:00:00 2001 From: Sam Wilkins Date: Mon, 3 Feb 2020 17:57:09 -0500 Subject: added base64 encodings support for image upload, removed logs from scraper.py --- src/client/views/collections/CollectionSubView.tsx | 7 +++++- src/scraping/buxton/scraper.py | 14 ++++++------ src/server/ApiManagers/DownloadManager.ts | 6 +++-- src/server/ApiManagers/GooglePhotosManager.ts | 9 ++++++-- src/server/ApiManagers/UploadManager.ts | 3 ++- src/server/ApiManagers/UtilManager.ts | 7 +++++- src/server/DashUploadUtils.ts | 26 ++++++++++++++++++---- 7 files changed, 54 insertions(+), 18 deletions(-) (limited to 'src/scraping/buxton/scraper.py') diff --git a/src/client/views/collections/CollectionSubView.tsx b/src/client/views/collections/CollectionSubView.tsx index 0eeb1c83d..9cdd48089 100644 --- a/src/client/views/collections/CollectionSubView.tsx +++ b/src/client/views/collections/CollectionSubView.tsx @@ -254,7 +254,12 @@ export function CollectionSubView(schemaCtor: (doc: Doc) => T) { const img = tags[0].startsWith("img") ? tags[0] : tags.length > 1 && tags[1].startsWith("img") ? 
tags[1] : ""; if (img) { const split = img.split("src=\"")[1].split("\"")[0]; - const doc = Docs.Create.ImageDocument(split, { ...options, _width: 300 }); + let source = split; + if (split.startsWith("data:image") && split.includes("base64")) { + const [{ clientAccessPath }] = await Networking.PostToServer("/uploadRemoteImage", { sources: [split] }); + source = Utils.prepend(clientAccessPath); + } + const doc = Docs.Create.ImageDocument(source, { ...options, _width: 300 }); ImageUtils.ExtractExif(doc); this.props.addDocument(doc); return; diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py index f7a38112d..c502ac30c 100644 --- a/src/scraping/buxton/scraper.py +++ b/src/scraping/buxton/scraper.py @@ -115,8 +115,8 @@ def write_collection(parse_results, display_fields, storage_key, viewType): target_collection.insert_one(view_doc) data_doc_guid = data_doc["_id"] - print(f"inserted view document ({view_doc_guid})") - print(f"inserted data document ({data_doc_guid})\n") + # print(f"inserted view document ({view_doc_guid})") + # print(f"inserted data document ({data_doc_guid})\n") return view_doc_guid @@ -233,7 +233,7 @@ def parse_document(file_name: str): result = {} dir_path = image_dist + "/" + pure_name - print(dir_path) + # print(dir_path) mkdir_if_absent(dir_path) raw = str(docx2txt.process(source + "/" + file_name, dir_path)) @@ -252,7 +252,7 @@ def parse_document(file_name: str): medium = dir_path + "/" + image.replace(".", "_m.", 1) copyfile(resolved, original) copyfile(resolved, medium) - print(f"extracted {count} images...") + # print(f"extracted {count} images...") def sanitize(line): return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace( u"\u2013", "-").replace(u"\u201c", '''"''').replace(u"\u201d", '''"''').strip() @@ -360,7 +360,7 @@ def parse_document(file_name: str): if len(notes) > 0: result["notes"] = listify(notes) - print("writing child schema...") + # print("writing child schema...") return { "schema": { @@ -392,7 +392,7 @@ def write_common_proto(): if os.path.exists(image_dist): - shutil.rmtree(image_dist) + shutil.rmtree(image_dist, True) while os.path.exists(image_dist): pass os.mkdir(image_dist) @@ -415,7 +415,7 @@ parent_guid = write_collection({ "__type": "Doc" }, "child_guids": schema_guids -}, ["title", "short_description", "original_price"], "data", 2) +}, ["title", "short_description", "original_price"], "data", 4) print("appending parent schema to main workspace...\n") target_collection.update_one( diff --git a/src/server/ApiManagers/DownloadManager.ts b/src/server/ApiManagers/DownloadManager.ts index 1bb84f374..fad5e6789 100644 --- a/src/server/ApiManagers/DownloadManager.ts +++ b/src/server/ApiManagers/DownloadManager.ts @@ -254,11 +254,13 @@ async function writeHierarchyRecursive(file: Archiver.Archiver, hierarchy: Hiera // and dropped in the browser and thus hosted remotely) so we upload it // to our server and point the zip file to it, so it can bundle up the bytes const information = await DashUploadUtils.UploadImage(result); - path = information.serverAccessPaths[SizeSuffix.Original]; + path = information instanceof Error ? "" : information.serverAccessPaths[SizeSuffix.Original]; } // write the file specified by the path to the directory in the // zip file given by the prefix. 
- file.file(path, { name: documentTitle, prefix }); + if (path) { + file.file(path, { name: documentTitle, prefix }); + } } else { // we've hit a collection, so we have to recurse await writeHierarchyRecursive(file, result, `${prefix}/${documentTitle}`); diff --git a/src/server/ApiManagers/GooglePhotosManager.ts b/src/server/ApiManagers/GooglePhotosManager.ts index 107542ce2..1727cc5a6 100644 --- a/src/server/ApiManagers/GooglePhotosManager.ts +++ b/src/server/ApiManagers/GooglePhotosManager.ts @@ -88,8 +88,13 @@ export default class GooglePhotosManager extends ApiManager { if (contents) { const completed: Opt[] = []; for (const item of contents.mediaItems) { - const { contentSize, ...attributes } = await DashUploadUtils.InspectImage(item.baseUrl); - const found: Opt = await Database.Auxiliary.QueryUploadHistory(contentSize!); + const results = await DashUploadUtils.InspectImage(item.baseUrl); + if (results instanceof Error) { + failed++; + continue; + } + const { contentSize, ...attributes } = results; + const found: Opt = await Database.Auxiliary.QueryUploadHistory(contentSize); if (!found) { const upload = await DashUploadUtils.UploadInspectedImage({ contentSize, ...attributes }, item.filename, prefix).catch(error => _error(res, downloadError, error)); if (upload) { diff --git a/src/server/ApiManagers/UploadManager.ts b/src/server/ApiManagers/UploadManager.ts index a92b613b7..4d09528f4 100644 --- a/src/server/ApiManagers/UploadManager.ts +++ b/src/server/ApiManagers/UploadManager.ts @@ -65,7 +65,8 @@ export default class UploadManager extends ApiManager { secureHandler: async ({ req, res }) => { const { sources } = req.body; if (Array.isArray(sources)) { - return res.send(await Promise.all(sources.map(url => DashUploadUtils.UploadImage(url)))); + const results = await Promise.all(sources.map(source => DashUploadUtils.UploadImage(source))); + return res.send(results); } res.send(); } diff --git a/src/server/ApiManagers/UtilManager.ts b/src/server/ApiManagers/UtilManager.ts index a0d0d0f4b..d7b085a30 100644 --- a/src/server/ApiManagers/UtilManager.ts +++ b/src/server/ApiManagers/UtilManager.ts @@ -47,7 +47,12 @@ export default class UtilManager extends ApiManager { const onResolved = (stdout: string) => { console.log(stdout); res.redirect("/"); }; const onRejected = (err: any) => { console.error(err.message); res.send(err); }; - const tryPython3 = () => command_line('python3 scraper.py', cwd).then(onResolved, onRejected); + const tryPython3 = (reason: any) => { + console.log("Initial scraper failed for the following reason:"); + console.log(red(reason.Error)); + console.log("Falling back to python3..."); + command_line('python3 scraper.py', cwd).then(onResolved, onRejected); + }; return command_line('python scraper.py', cwd).then(onResolved, tryPython3); }, diff --git a/src/server/DashUploadUtils.ts b/src/server/DashUploadUtils.ts index cb7104757..27c4bf854 100644 --- a/src/server/DashUploadUtils.ts +++ b/src/server/DashUploadUtils.ts @@ -1,4 +1,4 @@ -import { unlinkSync, createWriteStream, readFileSync, rename } from 'fs'; +import { unlinkSync, createWriteStream, readFileSync, rename, writeFile } from 'fs'; import { Utils } from '../Utils'; import * as path from 'path'; import * as sharp from 'sharp'; @@ -127,9 +127,12 @@ export namespace DashUploadUtils { * 3) the size of the image, in bytes (4432130) * 4) the content type of the image, i.e. image/(jpeg | png | ...) 
*/ - export const UploadImage = async (source: string, filename?: string, format?: string, prefix: string = ""): Promise => { + export const UploadImage = async (source: string, filename?: string, format?: string, prefix: string = ""): Promise => { const metadata = await InspectImage(source); - return UploadInspectedImage(metadata, filename, format, prefix); + if (metadata instanceof Error) { + return metadata; + } + return UploadInspectedImage(metadata, filename || metadata.filename, format, prefix); }; export interface InspectionResults { @@ -140,6 +143,7 @@ export namespace DashUploadUtils { contentType: string; nativeWidth: number; nativeHeight: number; + filename?: string; } export interface EnrichedExifData { @@ -164,7 +168,20 @@ export namespace DashUploadUtils { * * @param source is the path or url to the image in question */ - export const InspectImage = async (source: string): Promise => { + export const InspectImage = async (source: string): Promise => { + let rawMatches: RegExpExecArray | null; + let filename: string | undefined; + if ((rawMatches = /^data:image\/([a-z]+);base64,(.*)/.exec(source)) !== null) { + const [ext, data] = rawMatches.slice(1, 3); + const resolved = filename = `upload_${Utils.GenerateGuid()}.${ext}`; + const error = await new Promise(resolve => { + writeFile(serverPathToFile(Directory.images, resolved), data, "base64", resolve); + }); + if (error !== null) { + return error; + } + source = `http://localhost:1050${clientPathToFile(Directory.images, resolved)}`; + } let resolvedUrl: string; const matches = isLocal().exec(source); if (matches === null) { @@ -187,6 +204,7 @@ export namespace DashUploadUtils { contentType: headers[type], nativeWidth, nativeHeight, + filename, ...results }; }; -- cgit v1.2.3-70-g09d2
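The upshot of this last commit is that a pasted data-URI image now round-trips through the server instead of being stored inline. A minimal sketch of exercising the new route from Python, the scraper's own language (assumptions: the Dash server from this repo is running on localhost:1050, /uploadRemoteImage is mounted at the server root as the client's Networking.PostToServer call implies, the requests package is installed, and the image path is hypothetical):

    import base64
    import requests

    # Build a data URI like the ones CollectionSubView.tsx now detects on paste.
    with open("./source/example.png", "rb") as image:
        payload = base64.b64encode(image.read()).decode("utf-8")
    data_uri = f"data:image/png;base64,{payload}"

    # UploadManager reads a `sources` array from the request body and replies with
    # one upload-information object per source; DashUploadUtils.InspectImage takes
    # the base64 branch and writes the decoded bytes into the images directory.
    [info] = requests.post(
        "http://localhost:1050/uploadRemoteImage",
        json={"sources": [data_uri]},
    ).json()

    # The client prepends the server origin to this path when it builds the
    # ImageDocument, so only a short access path is persisted, not the payload.
    print(info["clientAccessPath"])

Persisting the upload's access path rather than the raw base64 string keeps the created ImageDocument small and lets the server host the image like any other upload.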