diff options
author | A.J. Shulman <Shulman.aj@gmail.com> | 2025-05-11 13:42:00 -0400 |
---|---|---|
committer | A.J. Shulman <Shulman.aj@gmail.com> | 2025-05-11 13:42:00 -0400 |
commit | a5d7f5c38192b91b7df3bd6ecace5ba7365449a6 (patch) | |
tree | c6be94f983b5fcc65424b81d42ddb0718127404c /src/server/chunker/pdf_chunker.py | |
parent | 3c28aa3a706869d818bc8a089e8d1a53f7234bc0 (diff) |
Made it so chunk Ids are separately managed and made sure the doc id is consistent and not created in python spawn
Diffstat (limited to 'src/server/chunker/pdf_chunker.py')
-rw-r--r-- | src/server/chunker/pdf_chunker.py | 13 |
1 files changed, 7 insertions, 6 deletions
diff --git a/src/server/chunker/pdf_chunker.py b/src/server/chunker/pdf_chunker.py index e9b9ef2b3..e34753176 100644 --- a/src/server/chunker/pdf_chunker.py +++ b/src/server/chunker/pdf_chunker.py @@ -622,7 +622,7 @@ class Document: Represents a document being processed, such as a PDF, handling chunking, embedding, and summarization. """ - def __init__(self, file_path: str, file_name: str, job_id: str, output_folder: str): + def __init__(self, file_path: str, file_name: str, job_id: str, output_folder: str, doc_id: str): """ Initialize the Document with file data, file name, and job ID. @@ -635,7 +635,7 @@ class Document: self.file_path = file_path self.job_id = job_id self.type = self._get_document_type(file_name) # Determine the document type (PDF, CSV, etc.) - self.doc_id = job_id # Use the job ID as the document ID + self.doc_id = doc_id # Use the job ID as the document ID self.chunks = [] # List to hold text and visual chunks self.num_pages = 0 # Number of pages in the document (if applicable) self.summary = "" # The generated summary for the document @@ -755,7 +755,7 @@ class Document: "doc_id": self.doc_id }, indent=2) # Convert the document's attributes to JSON format -def process_document(file_path, job_id, output_folder): +def process_document(file_path, job_id, output_folder, doc_id): """ Top-level function to process a document and return the JSON output. @@ -763,26 +763,27 @@ def process_document(file_path, job_id, output_folder): :param job_id: The job ID for this document processing task. :return: The processed document's data in JSON format. """ - new_document = Document(file_path, file_path, job_id, output_folder) + new_document = Document(file_path, file_path, job_id, output_folder, doc_id) return new_document.to_json() def main(): """ Main entry point for the script, called with arguments from Node.js. 
""" - if len(sys.argv) != 4: + if len(sys.argv) != 5: print(json.dumps({"error": "Invalid arguments"}), file=sys.stderr) return job_id = sys.argv[1] file_path = sys.argv[2] output_folder = sys.argv[3] # Get the output folder from arguments + doc_id = sys.argv[4] try: os.makedirs(output_folder, exist_ok=True) # Process the document - document_result = process_document(file_path, job_id, output_folder) # Pass output_folder + document_result = process_document(file_path, job_id, output_folder,doc_id) # Pass output_folder # Output the final result as JSON to stdout print(document_result) |