Made it so chunk IDs are separately managed and made sure the doc ID is consistent and not created in the Python spawn

author: A.J. Shulman <Shulman.aj@gmail.com> 2025-05-11 13:42:00 -0400
committer: A.J. Shulman <Shulman.aj@gmail.com> 2025-05-11 13:42:00 -0400
commit: a5d7f5c38192b91b7df3bd6ecace5ba7365449a6 (patch)
tree: c6be94f983b5fcc65424b81d42ddb0718127404c /src/server/chunker/pdf_chunker.py
parent: 3c28aa3a706869d818bc8a089e8d1a53f7234bc0 (diff)
1 files changed, 7 insertions, 6 deletions
diff --git a/src/server/chunker/pdf_chunker.py b/src/server/chunker/pdf_chunker.py
index e9b9ef2b3..e34753176 100644
--- a/src/server/chunker/pdf_chunker.py
+++ b/src/server/chunker/pdf_chunker.py
@@ -622,7 +622,7 @@ class Document:
     Represents a document being processed, such as a PDF, handling chunking, embedding, and summarization.
     """
 
-    def __init__(self, file_path: str, file_name: str, job_id: str, output_folder: str):
+    def __init__(self, file_path: str, file_name: str, job_id: str, output_folder: str, doc_id: str):
         """
         Initialize the Document with file data, file name, and job ID.
 
@@ -635,7 +635,7 @@ class Document:
         self.file_path = file_path
         self.job_id = job_id
         self.type = self._get_document_type(file_name)  # Determine the document type (PDF, CSV, etc.)
-        self.doc_id = job_id  # Use the job ID as the document ID
+        self.doc_id = doc_id  # Use the caller-supplied document ID (no longer derived from the job ID)
         self.chunks = []  # List to hold text and visual chunks
         self.num_pages = 0  # Number of pages in the document (if applicable)
         self.summary = ""  # The generated summary for the document
@@ -755,7 +755,7 @@ class Document:
             "doc_id": self.doc_id
         }, indent=2)  # Convert the document's attributes to JSON format
 
-def process_document(file_path, job_id, output_folder):
+def process_document(file_path, job_id, output_folder, doc_id):
     """
     Top-level function to process a document and return the JSON output.
 
@@ -763,26 +763,27 @@ def process_document(file_path, job_id, output_folder):
     :param job_id: The job ID for this document processing task.
     :return: The processed document's data in JSON format.
     """
-    new_document = Document(file_path, file_path, job_id, output_folder)
+    new_document = Document(file_path, file_path, job_id, output_folder, doc_id)
     return new_document.to_json()
 
 def main():
     """
     Main entry point for the script, called with arguments from Node.js.
     """
-    if len(sys.argv) != 4:
+    if len(sys.argv) != 5:
         print(json.dumps({"error": "Invalid arguments"}), file=sys.stderr)
         return
 
     job_id = sys.argv[1]
     file_path = sys.argv[2]
     output_folder = sys.argv[3]  # Get the output folder from arguments
+    doc_id = sys.argv[4]
 
     try:
         os.makedirs(output_folder, exist_ok=True)
         
         # Process the document
-        document_result = process_document(file_path, job_id, output_folder)  # Pass output_folder
+        document_result = process_document(file_path, job_id, output_folder,doc_id)  # Pass output_folder
 
         # Output the final result as JSON to stdout
         print(document_result)
author	A.J. Shulman <Shulman.aj@gmail.com>	2025-05-11 13:42:00 -0400
committer	A.J. Shulman <Shulman.aj@gmail.com>	2025-05-11 13:42:00 -0400
commit	a5d7f5c38192b91b7df3bd6ecace5ba7365449a6 (patch)
tree	c6be94f983b5fcc65424b81d42ddb0718127404c /src/server/chunker/pdf_chunker.py
parent	3c28aa3a706869d818bc8a089e8d1a53f7234bc0 (diff)