Unstructured API services
Getting started with API services
Process individual files
Batch processing and ingestion
- Overview
- Ingest CLI
- Ingest Python library
- Ingest dependencies
- Ingest configuration
- Source connectors
- Destination connectors
How to
- Choose a partitioning strategy
- Choose a hi-res model
- Get element contents
- Process a subset of files
- Set embedding behavior
- Parse simple PDFs and HTML
- Set partitioning behavior
- Set chunking behavior
- Output unique element IDs
- Output bounding box coordinates
- Set document language for better OCR
- Extract tables as HTML
- Extract images and tables from documents
- Get chunked elements
- Change element coordinate systems
- Work with PowerPoint files
- Use LangChain and Ollama
- Use LangChain and Llama 3
- Transform a JSON file into a different schema
- Generate a JSON schema for a file
Troubleshooting
Endpoints
Extract tables as HTML
Task
You want to get, save, or show the contents of elements that are represented as HTML, such as tables that are embedded in a PDF document.
Approach
Extract the contents of an element’s `text_as_html` JSON object, which is nested inside of its parent `metadata` object.
To run this example
You will need a document that is one of the document types that can output the `text_as_html` JSON object. For the list of applicable document types, see the entries in the table at the beginning of Partitioning where “Table Support” is “Yes.”
This example uses a PDF file with an embedded table.
Code
For the Unstructured Ingest Python library, you can use the standard Python `json.load` function to load the contents of a JSON file that the Ingest Python library outputs after processing is complete. The loaded result is a Python list of dictionaries, one dictionary per element.
import json, os, webbrowser
def get_tables_as_html(
    input_json_file_path: str,
    output_html_dir_path: str,
    *,
    open_in_browser: bool = True,
) -> None:
    """Save each HTML table found in an Ingest JSON output file as its own page.

    The JSON file is loaded as a list of element dictionaries. For every
    element whose ``metadata`` contains a ``text_as_html`` entry, that HTML is
    wrapped in a minimal HTML document, written to
    ``<output_html_dir_path>/<element_id>.html``, and optionally opened in the
    default web browser.

    Args:
        input_json_file_path: Path to the JSON file to read.
        output_html_dir_path: Directory that receives the ``.html`` files. It
            is created (including parents) if it does not exist yet.
        open_in_browser: When True (the default), open each saved file in the
            default web browser. Pass False for headless or batch runs.

    Raises:
        OSError: If the input file cannot be read or an output file cannot be
            written.
        json.JSONDecodeError: If the input file is not valid JSON.
        KeyError: If an element that has ``text_as_html`` lacks ``element_id``.
    """
    # Function-scope import so the file's top-level import line is untouched.
    from pathlib import Path

    with open(input_json_file_path, "r", encoding="utf-8") as json_file:
        file_elements = json.load(json_file)

    output_dir = Path(output_html_dir_path)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Minimal CSS for better table readability. The charset declaration matches
    # the UTF-8 encoding used when writing, so non-ASCII cell text renders
    # correctly when the file is opened from disk.
    html_head = (
        '<head><meta charset="utf-8">'
        "<style>table, th, td { border: 1px solid; }</style></head>"
    )

    for element in file_elements:
        # Tolerate elements that carry no metadata at all.
        metadata = element.get("metadata") or {}
        if "text_as_html" not in metadata:
            continue

        # Surround the element's HTML with basic <html> and <body> tags.
        # Single-quoted keys keep these f-strings valid before Python 3.12.
        html_string = (
            f"<!DOCTYPE html><html>{html_head}"
            f"<body>{metadata['text_as_html']}</body></html>"
        )

        # Save the element's HTML to a local file named after its element ID.
        save_path = output_dir / f"{element['element_id']}.html"
        with open(save_path, "w", encoding="utf-8") as html_file:
            html_file.write(html_string)

        if open_in_browser:
            # as_uri() builds a well-formed file:// URL for relative and
            # absolute output directories alike.
            webbrowser.open_new(save_path.resolve().as_uri())
if __name__ == "__main__":
    # Sample document:
    # https://github.com/Unstructured-IO/unstructured-ingest/blob/main/example-docs/pdf/embedded-images-tables.pdf
    #
    # Both locations are given relative to this .py file: the first is the
    # JSON file to read, the second is the directory that receives the HTML.
    ingest_output_json = "local-ingest-output/embedded-images-tables.json"
    html_output_dir = "local-ingest-output/html/"

    get_tables_as_html(
        input_json_file_path=ingest_output_json,
        output_html_dir_path=html_output_dir,
    )
For the Unstructured Python SDK, you’ll need:
These environment variables:
- `UNSTRUCTURED_API_KEY` - Your Unstructured API key value.
- `UNSTRUCTURED_API_URL` - Your Unstructured API URL.
from unstructured_client import UnstructuredClient
from unstructured_client.models import operations, shared
from unstructured.staging.base import elements_from_dicts, elements_to_json
import os, webbrowser
async def _main() -> None:
    """Partition a local PDF through the Unstructured API and save its tables as HTML.

    Reads the API key and URL from the ``UNSTRUCTURED_API_KEY`` and
    ``UNSTRUCTURED_API_URL`` environment variables, sends the PDF for
    partitioning, writes one ``<element_id>.html`` file per element that has
    ``text_as_html`` in its metadata, opens each saved file in the default web
    browser, and finally saves all returned elements as JSON.

    Any exception raised while calling the API or writing the results is
    printed rather than propagated.
    """
    # Function-scope import so the file's top-level import lines are untouched.
    from pathlib import Path

    client = UnstructuredClient(
        api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"),
        server_url=os.getenv("UNSTRUCTURED_API_URL"),
    )

    # Source: https://github.com/Unstructured-IO/unstructured-ingest/blob/main/example-docs/pdf/embedded-images-tables.pdf

    # Where to get the local file, relative to this .py file.
    local_input_filepath = "local-ingest-input-pdf/embedded-images-tables.pdf"

    # Where to store the retrieved HTML (and the processed JSON), relative to this .py file.
    local_output_filepath = "local-ingest-output"

    with open(local_input_filepath, "rb") as f:
        files = shared.Files(
            content=f.read(),
            file_name=local_input_filepath,
        )

    # Pass the parameters by keyword; the request model does not take them
    # positionally in current SDK releases.
    request = operations.PartitionRequest(
        partition_parameters=shared.PartitionParameters(
            files=files,
            strategy=shared.Strategy.HI_RES,
            split_pdf_page=True,
            split_pdf_allow_failed=True,
            split_pdf_concurrency_level=15,
        )
    )

    try:
        result = await client.general.partition_async(request=request)
        # Treat a response without elements as an empty list.
        elements = result.elements or []

        output_dir = Path(local_output_filepath)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Minimal CSS for better table readability. The charset declaration
        # matches the UTF-8 encoding used when writing the files.
        html_head = (
            '<head><meta charset="utf-8">'
            "<style>table, th, td { border: 1px solid; }</style></head>"
        )

        for element in elements:
            metadata = element.get("metadata") or {}
            if "text_as_html" not in metadata:
                continue

            # Surround the element's HTML with basic <html> and <body> tags.
            # Single-quoted keys keep these f-strings valid before Python 3.12.
            html_string = (
                f"<!DOCTYPE html><html>{html_head}"
                f"<body>{metadata['text_as_html']}</body></html>"
            )

            # Save the element's HTML to a local file.
            save_path = output_dir / f"{element['element_id']}.html"
            with open(save_path, "w", encoding="utf-8") as html_file:
                html_file.write(html_string)

            # View the locally saved file in the local default web browser.
            webbrowser.open_new(save_path.resolve().as_uri())

        # Also get the elements for inspection and validation.
        dict_elements = elements_from_dicts(element_dicts=elements)

        # Save the elements as JSON.
        elements_to_json(
            elements=dict_elements,
            indent=2,
            filename=f"{local_output_filepath}/embedded-images-tables.json",
        )
    except Exception as e:
        # Top-level boundary of a sample script: report the failure and exit.
        print(e)


if __name__ == "__main__":
    # `await` is only valid inside a coroutine, so run the async entry point
    # on an event loop.
    import asyncio

    asyncio.run(_main())