Skip to content

Commit efb1eb6

Browse files
Merge pull request #122 from datalogics-cgreen/pdfcloud-5252-markdown-complex-sample
Add PDF to Markdown Python sample
2 parents ca3faf1 + d991604 commit efb1eb6

File tree

1 file changed

+133
-0
lines changed

1 file changed

+133
-0
lines changed
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
from requests_toolbelt import MultipartEncoder
2+
import requests
3+
import json
4+
5+
# This sample demonstrates a workflow to prepare form and image-only documents
# for conversion to Markdown. This process uses the Query PDF tool to determine
# whether the document contains forms or contains only images. Form documents
# have their forms flattened, and image-only documents are processed through
# the OCR PDF tool. The (possibly pre-processed) document is then sent to the
# Markdown endpoint and the JSON response is printed.
#
# To get started, configure the following constants.

# By default, we use the US-based API service. This is the primary endpoint for global use.
API_URL = "https://api.pdfrest.com"

# For GDPR compliance and enhanced performance for European users, you can switch to the EU-based service by uncommenting the URL below.
# For more information visit https://pdfrest.com/pricing#how-do-eu-gdpr-api-calls-work
#API_URL = "https://eu-api.pdfrest.com"

API_KEY = 'xxxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx' # Place your api key here.

OCR_LANGUAGES = "English" # a comma-separated list of languages for the OCR tool to use

# Replace this value with the directory that contains your input PDF. Keep the
# trailing slash: the file name below is appended to this string as-is.
INPUT_FILE_LOCATION = '/path/to/file/location/'

INPUT_FILE_NAME = 'myFile.pdf' # Replace this value with the filename of your input PDF.

DO_OCR = True # If True, this sample will perform OCR on PDFs that are image-only.

DO_FORM_FLATTENING = True # If True, this sample will flatten forms in form documents.


def _post_multipart(endpoint, fields):
    """POST ``fields`` as multipart/form-data to ``{API_URL}/{endpoint}``.

    Args:
        endpoint: Path segment of the API tool to call, e.g. ``"pdf-info"``.
        fields: Mapping of multipart field names to values, passed unchanged
            to ``MultipartEncoder``.

    Returns:
        The ``requests.Response`` returned by the service.
    """
    encoder = MultipartEncoder(fields=fields)
    request_headers = {
        'Accept': 'application/json',
        # The content type carries the multipart boundary chosen by the encoder.
        'Content-Type': encoder.content_type,
        'Api-Key': API_KEY
    }
    return requests.post(f"{API_URL}/{endpoint}", data=encoder, headers=request_headers)


# Step 1: upload the PDF and ask which kind of document it is.
# The file is opened in a `with` block so the handle is always closed, and the
# request is sent inside the block because the encoder reads from the open
# file while the upload is in progress.
with open(f"{INPUT_FILE_LOCATION}{INPUT_FILE_NAME}", 'rb') as input_pdf:
    print("Sending POST request to Query PDF endpoint...")
    response = _post_multipart(
        "pdf-info",
        {
            'file': (INPUT_FILE_NAME, input_pdf, 'application/pdf'),
            'queries': "image_only,contains_xfa,contains_acroforms",
        },
    )

print("Response status code: " + str(response.status_code))

if response.ok:
    query_response = response.json()
    if query_response["allQueriesProcessed"]:
        # The uploaded file's id is reused as the input of the following calls.
        pdf_id = query_response["inputId"]

        # NOTE For demo purposes: a "message" field in the response is taken to
        # mean a starter key is in use, in which case the query values are
        # compared against the masked string "tr**" instead of being used
        # directly (presumably starter keys return masked values -- confirm
        # against the pdfRest documentation).
        is_starter_key = "message" in query_response
        if is_starter_key:
            pdf_contains_forms = query_response["contains_xfa"] == "tr**" or query_response["contains_acroforms"] == "tr**"
            image_only_pdf = query_response["image_only"] == "tr**"
        else:
            pdf_contains_forms = query_response["contains_xfa"] or query_response["contains_acroforms"]
            image_only_pdf = query_response["image_only"]

        # Step 2 (optional): pre-process the document. Form flattening takes
        # precedence; OCR only runs when flattening was not selected. If the
        # pre-processing call fails, its error is printed and the conversion
        # below still runs on the unmodified upload.
        if pdf_contains_forms and DO_FORM_FLATTENING:
            print("Sending POST request to Flatten Forms endpoint...")
            response = _post_multipart("flattened-forms-pdf", {'id': pdf_id})
            if response.ok:
                flatten_response = response.json()
                pdf_id = flatten_response["outputId"]
            else:
                print(response.text)
        elif image_only_pdf and DO_OCR:
            print("Sending POST request to OCR PDF endpoint...")
            response = _post_multipart(
                "pdf-with-ocr-text",
                {
                    'id': pdf_id,
                    'languages': OCR_LANGUAGES,
                },
            )
            if response.ok:
                ocr_response = response.json()
                pdf_id = ocr_response["outputId"]
            else:
                print(response.text)

        # Step 3: convert the (possibly pre-processed) document to Markdown.
        print("Sending POST request to Markdown endpoint...")
        response = _post_multipart(
            "markdown",
            {
                'id': pdf_id,
                'output_type': "file", # Set to 'file' to get a download link to the .MD file.
            },
        )

        print("Response status code: " + str(response.status_code))

        if response.ok:
            markdown_response = response.json()
            print(json.dumps(markdown_response, indent=2))
        else:
            print(response.text)
    else:
        # The service could not answer every query; show what it returned.
        print(response.text)
else:
    print(response.text)

0 commit comments

Comments
 (0)