aboutsummaryrefslogtreecommitdiff
path: root/src/server/chunker/pdf_chunker.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/server/chunker/pdf_chunker.py')
-rw-r--r--src/server/chunker/pdf_chunker.py13
1 files changed, 7 insertions, 6 deletions
diff --git a/src/server/chunker/pdf_chunker.py b/src/server/chunker/pdf_chunker.py
index e9b9ef2b3..e34753176 100644
--- a/src/server/chunker/pdf_chunker.py
+++ b/src/server/chunker/pdf_chunker.py
@@ -622,7 +622,7 @@ class Document:
Represents a document being processed, such as a PDF, handling chunking, embedding, and summarization.
"""
- def __init__(self, file_path: str, file_name: str, job_id: str, output_folder: str):
+ def __init__(self, file_path: str, file_name: str, job_id: str, output_folder: str, doc_id: str):
"""
Initialize the Document with file data, file name, and job ID.
@@ -635,7 +635,7 @@ class Document:
self.file_path = file_path
self.job_id = job_id
self.type = self._get_document_type(file_name) # Determine the document type (PDF, CSV, etc.)
- self.doc_id = job_id # Use the job ID as the document ID
+ self.doc_id = doc_id # Use the caller-supplied document ID (no longer derived from the job ID)
self.chunks = [] # List to hold text and visual chunks
self.num_pages = 0 # Number of pages in the document (if applicable)
self.summary = "" # The generated summary for the document
@@ -755,7 +755,7 @@ class Document:
"doc_id": self.doc_id
}, indent=2) # Convert the document's attributes to JSON format
-def process_document(file_path, job_id, output_folder):
+def process_document(file_path, job_id, output_folder, doc_id):
"""
Top-level function to process a document and return the JSON output.
@@ -763,26 +763,27 @@ def process_document(file_path, job_id, output_folder):
:param job_id: The job ID for this document processing task.
:return: The processed document's data in JSON format.
"""
- new_document = Document(file_path, file_path, job_id, output_folder)
+ new_document = Document(file_path, file_path, job_id, output_folder, doc_id)
return new_document.to_json()
def main():
"""
Main entry point for the script, called with arguments from Node.js.
"""
- if len(sys.argv) != 4:
+ if len(sys.argv) != 5:
print(json.dumps({"error": "Invalid arguments"}), file=sys.stderr)
return
job_id = sys.argv[1]
file_path = sys.argv[2]
output_folder = sys.argv[3] # Get the output folder from arguments
+ doc_id = sys.argv[4]
try:
os.makedirs(output_folder, exist_ok=True)
# Process the document
- document_result = process_document(file_path, job_id, output_folder) # Pass output_folder
+ document_result = process_document(file_path, job_id, output_folder,doc_id) # Pass output_folder and doc_id
# Output the final result as JSON to stdout
print(document_result)