document_processor
Process documents by applying XFDF annotations, flattening, OCR, page rotation, watermarking, and redaction using the Nutrient DWS Processor API. Supports PDF, PDF/A, PDF/UA, images, JSON extraction, Office formats, HTML, and Markdown output.
Instructions
Processes documents using the Nutrient DWS Processor API. Reads from and writes to the local file system, or to the sandbox if it is enabled.
Features: • Import XFDF annotations • Flatten annotations • OCR processing • Page rotation • Watermarking (text/image) • Redaction creation and application
Output formats: PDF, PDF/A, PDF/UA, images (PNG, JPEG, WebP), JSON extraction, Office (DOCX, XLSX, PPTX), HTML, Markdown
Input Schema
Name | Required | Description | Default |
---|---|---|---|
instructions | Yes | Build instructions. | |
outputPath | Yes | Path to the output file. Resolves to the sandbox path if the sandbox is enabled; otherwise resolves to the local file system. | |
Input Schema (JSON Schema)
{
"$schema": "http://json-schema.org/draft-07/schema#",
"additionalProperties": false,
"properties": {
"instructions": {
"additionalProperties": false,
"description": "Build instructions.",
"properties": {
"actions": {
"description": "Actions to be performed on the document after it is built from the parts.",
"items": {
"anyOf": [
{
"additionalProperties": false,
"properties": {
"file": {
"description": "The path to the XFDF file or a reference to a file in the multipart request. Resolves to sandbox path if enabled, otherwise resolves to the local file system.",
"type": "string"
},
"type": {
"const": "applyXfdf",
"description": "Apply the XFDF file to the document to import its annotations.",
"type": "string"
}
},
"required": [
"type",
"file"
],
"type": "object"
},
{
"additionalProperties": false,
"properties": {
"type": {
"const": "flatten",
"description": "Flatten the annotations in the document.",
"type": "string"
}
},
"required": [
"type"
],
"type": "object"
},
{
"additionalProperties": false,
"properties": {
"language": {
"description": "Language to be used for the OCR text extraction.",
"type": "string"
},
"type": {
"const": "ocr",
"description": "Perform optical character recognition (OCR) on the document.",
"type": "string"
}
},
"required": [
"type",
"language"
],
"type": "object"
},
{
"additionalProperties": false,
"properties": {
"rotateBy": {
"description": "The angle by which the pages should be rotated, clockwise.",
"type": "number"
},
"type": {
"const": "rotate",
"description": "Rotate all pages by the angle specified.",
"type": "string"
}
},
"required": [
"type",
"rotateBy"
],
"type": "object"
},
{
"additionalProperties": false,
"properties": {
"fontColor": {
"description": "Hex color of the watermark text, in the form #RRGGBB (must match ^#[0-9a-fA-F]{6}$).",
"type": "string"
},
"height": {
"anyOf": [
{
"$ref": "#/properties/instructions/properties/actions/items/anyOf/4/properties/width/anyOf/0"
},
{
"$ref": "#/properties/instructions/properties/actions/items/anyOf/4/properties/width/anyOf/1"
}
],
"description": "Height of the watermark."
},
"image": {
"description": "For image watermarks, the path to the image file or a reference to a file in the multipart request. Resolves to sandbox path if enabled, otherwise resolves to the local file system.",
"type": "string"
},
"opacity": {
"description": "Watermark opacity. 0 is fully transparent, 1 is fully opaque. 0.7 is an optimal value.",
"maximum": 1,
"minimum": 0,
"type": "number"
},
"rotation": {
"default": 0,
"description": "Rotation of the watermark in degrees, counterclockwise.",
"type": "number"
},
"text": {
"description": "For text watermarks, the text used for watermarking.",
"type": "string"
},
"type": {
"const": "watermark",
"description": "Watermark action.",
"type": "string"
},
"watermarkType": {
"description": "Type of the watermark.",
"enum": [
"text",
"image"
],
"type": "string"
},
"width": {
"anyOf": [
{
"description": "Value in points",
"type": "number"
},
{
"description": "Percentage value",
"pattern": "^\\d+%$",
"type": "string"
}
],
"description": "Width of the watermark."
}
},
"required": [
"type",
"watermarkType",
"width",
"height"
],
"type": "object"
},
{
"additionalProperties": false,
"properties": {
"strategy": {
"description": "The strategy to use for creating redactions.",
"enum": [
"preset",
"regex",
"text"
],
"type": "string"
},
"strategyOptions": {
"anyOf": [
{
"additionalProperties": false,
"properties": {
"includeAnnotations": {
"default": true,
"description": "Determines if redaction annotations are created on top of annotations whose content matches the search criteria.",
"type": "boolean"
},
"limit": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "The number of pages to search, counting from the `start` page. Defaults to searching through the end of the document."
},
"preset": {
"description": "The predefined pattern to search for:\n - credit-card-number — matches a number with 13 to 19 digits that begins with 1–6.\n Spaces and - are allowed anywhere in the number.\n - date — matches date formats such as mm/dd/yyyy, mm/dd/yy, dd/mm/yyyy, and dd/mm/yy.\n It rejects any days greater than 31 or months greater than 12 and accepts a leading 0 in front of a single-digit day or month.\n The delimiter can be -, ., or /.\n - email-address — matches an email address. Expects the format of *@*.* with at least two levels of the domain name.\n - international-phone-number — matches international phone numbers.\n The number can have 7 to 15 digits with spaces or - occurring anywhere within the number, and it must have a prefix of + or 00.\n - ipv4 — matches an IPv4 address with an optional mask at the end.\n - ipv6 — matches a full or compressed IPv6 address as defined in RFC 2373.\n - mac-address — matches a MAC address with either - or : as a delimiter.\n - north-american-phone-number — matches North American-style phone numbers.\n NANPA standardization is used with international support.\n - social-security-number — matches a social security number.\n Expects the format of XXX-XX-XXXX or XXXXXXXXX, with X denoting digits.\n - time — matches time formats such as 00:00:00, 00:00, and 00:00 PM. 12- and 24-hour formats are allowed.\n Seconds and AM/PM denotation are both optional.\n - url — matches a URL with a prefix of http or https, with an optional subdomain.\n - us-zip-code — matches a USA-style zip code. The format expected is XXXXX, XXXXX-XXXX or XXXXX/XXXX.\n - vin — matches US and ISO Standard 3779 Vehicle Identification Numbers.\n The format expects 17 characters, with the last 5 characters being numeric. I, i, O, o, Q, q, and _ characters are not allowed.\n",
"enum": [
"credit-card-number",
"date",
"email-address",
"international-phone-number",
"ipv4",
"ipv6",
"mac-address",
"north-american-phone-number",
"social-security-number",
"time",
"url",
"us-zip-code",
"vin"
],
"type": "string"
},
"start": {
"default": 0,
"description": "The index of the page from where you want to start the search.",
"type": "integer"
}
},
"required": [
"preset"
],
"type": "object"
},
{
"additionalProperties": false,
"properties": {
"caseSensitive": {
"default": true,
"description": "Determines if the search will be case sensitive.",
"type": "boolean"
},
"includeAnnotations": {
"$ref": "#/properties/instructions/properties/actions/items/anyOf/5/properties/strategyOptions/anyOf/0/properties/includeAnnotations"
},
"limit": {
"$ref": "#/properties/instructions/properties/actions/items/anyOf/5/properties/strategyOptions/anyOf/0/properties/limit"
},
"regex": {
"description": "Regex search term used for searching for text to redact.",
"type": "string"
},
"start": {
"$ref": "#/properties/instructions/properties/actions/items/anyOf/5/properties/strategyOptions/anyOf/0/properties/start"
}
},
"required": [
"regex"
],
"type": "object"
},
{
"additionalProperties": false,
"properties": {
"caseSensitive": {
"default": false,
"description": "Determines if the search will be case sensitive.",
"type": "boolean"
},
"includeAnnotations": {
"$ref": "#/properties/instructions/properties/actions/items/anyOf/5/properties/strategyOptions/anyOf/0/properties/includeAnnotations"
},
"limit": {
"$ref": "#/properties/instructions/properties/actions/items/anyOf/5/properties/strategyOptions/anyOf/0/properties/limit"
},
"start": {
"$ref": "#/properties/instructions/properties/actions/items/anyOf/5/properties/strategyOptions/anyOf/0/properties/start"
},
"text": {
"description": "Search term used for searching for text to redact.",
"type": "string"
}
},
"required": [
"text"
],
"type": "object"
}
],
"description": "Options for the selected strategy."
},
"type": {
"const": "createRedactions",
"description": "Creates redactions according to the given strategy. Once redactions are created, they need to be applied using the applyRedactions action.",
"type": "string"
}
},
"required": [
"type",
"strategy",
"strategyOptions"
],
"type": "object"
},
{
"additionalProperties": false,
"properties": {
"type": {
"const": "applyRedactions",
"description": "Applies the redactions created by an earlier createRedactions action.",
"type": "string"
}
},
"required": [
"type"
],
"type": "object"
}
]
},
"type": "array"
},
"output": {
"anyOf": [
{
"additionalProperties": false,
"properties": {
"labels": {
"description": "Page labels.",
"items": {
"additionalProperties": false,
"properties": {
"label": {
"description": "The label to apply to specified pages.",
"type": "string"
},
"pages": {
"additionalProperties": false,
"description": "Page range to apply the label to (0-based indexing).",
"properties": {
"end": {
"$ref": "#/properties/instructions/properties/parts/items/properties/pages/properties/end"
},
"start": {
"$ref": "#/properties/instructions/properties/parts/items/properties/pages/properties/start"
}
},
"type": "object"
}
},
"required": [
"pages",
"label"
],
"type": "object"
},
"type": "array"
},
"metadata": {
"additionalProperties": false,
"description": "Document metadata.",
"properties": {
"author": {
"description": "The document author.",
"type": "string"
},
"title": {
"description": "The document title.",
"type": "string"
}
},
"type": "object"
},
"optimize": {
"additionalProperties": false,
"description": "PDF optimization options.",
"properties": {
"disableImages": {
"default": false,
"description": "Disable images in the document.",
"type": "boolean"
},
"grayscaleAnnotations": {
"default": false,
"description": "Convert annotations to grayscale.",
"type": "boolean"
},
"grayscaleFormFields": {
"default": false,
"description": "Convert form fields to grayscale.",
"type": "boolean"
},
"grayscaleGraphics": {
"default": false,
"description": "Convert graphics to grayscale.",
"type": "boolean"
},
"grayscaleImages": {
"default": false,
"description": "Convert images to grayscale.",
"type": "boolean"
},
"grayscaleText": {
"default": false,
"description": "Convert text to grayscale.",
"type": "boolean"
},
"imageOptimizationQuality": {
"default": 2,
"description": "Image optimization quality.",
"maximum": 4,
"minimum": 1,
"type": "number"
},
"linearize": {
"default": false,
"description": "Linearize the PDF for faster loading over the network.",
"type": "boolean"
},
"mrcCompression": {
"default": false,
"description": "Use MRC compression.",
"type": "boolean"
}
},
"type": "object"
},
"owner_password": {
"description": "Password required to modify the document.",
"type": "string"
},
"type": {
"const": "pdf",
"description": "Output as standard PDF.",
"type": "string"
},
"user_password": {
"description": "Password required to open the document.",
"type": "string"
},
"user_permissions": {
"description": "Permissions granted when the document is opened with the user password.",
"items": {
"enum": [
"printing",
"modification",
"extract",
"annotations_and_forms",
"fill_forms",
"extract_accessibility",
"assemble",
"print_high_quality"
],
"type": "string"
},
"type": "array"
}
},
"required": [
"type"
],
"type": "object"
},
{
"additionalProperties": false,
"properties": {
"conformance": {
"description": "PDF/A conformance level.",
"enum": [
"pdfa-1a",
"pdfa-1b",
"pdfa-2a",
"pdfa-2u",
"pdfa-2b",
"pdfa-3a",
"pdfa-3u"
],
"type": "string"
},
"labels": {
"$ref": "#/properties/instructions/properties/output/anyOf/0/properties/labels"
},
"metadata": {
"$ref": "#/properties/instructions/properties/output/anyOf/0/properties/metadata"
},
"optimize": {
"$ref": "#/properties/instructions/properties/output/anyOf/0/properties/optimize"
},
"owner_password": {
"$ref": "#/properties/instructions/properties/output/anyOf/0/properties/owner_password"
},
"rasterization": {
"default": true,
"description": "Produce raster-based graphic elements where applicable.",
"type": "boolean"
},
"type": {
"const": "pdfa",
"description": "Output as PDF/A for archiving.",
"type": "string"
},
"user_password": {
"$ref": "#/properties/instructions/properties/output/anyOf/0/properties/user_password"
},
"user_permissions": {
"$ref": "#/properties/instructions/properties/output/anyOf/0/properties/user_permissions"
},
"vectorization": {
"default": true,
"description": "Produce vector-based graphic elements where applicable.",
"type": "boolean"
}
},
"required": [
"type"
],
"type": "object"
},
{
"additionalProperties": false,
"properties": {
"labels": {
"$ref": "#/properties/instructions/properties/output/anyOf/0/properties/labels"
},
"metadata": {
"$ref": "#/properties/instructions/properties/output/anyOf/0/properties/metadata"
},
"optimize": {
"$ref": "#/properties/instructions/properties/output/anyOf/0/properties/optimize"
},
"owner_password": {
"$ref": "#/properties/instructions/properties/output/anyOf/0/properties/owner_password"
},
"type": {
"const": "pdfua",
"description": "Output as PDF/UA for accessibility.",
"type": "string"
},
"user_password": {
"$ref": "#/properties/instructions/properties/output/anyOf/0/properties/user_password"
},
"user_permissions": {
"$ref": "#/properties/instructions/properties/output/anyOf/0/properties/user_permissions"
}
},
"required": [
"type"
],
"type": "object"
},
{
"additionalProperties": false,
"properties": {
"dpi": {
"description": "Resolution of the rendered image in dots per inch. Only one of width, height, or dpi can be defined.",
"type": "number"
},
"format": {
"default": "png",
"description": "Image format.",
"enum": [
"png",
"jpeg",
"jpg",
"webp"
],
"type": "string"
},
"height": {
"description": "Height of the rendered image in pixels. Only one of width, height, or dpi can be defined.",
"type": "number"
},
"pages": {
"$ref": "#/properties/instructions/properties/parts/items/properties/pages",
"description": "Page range to render (0-based indexing)."
},
"type": {
"const": "image",
"description": "Output as image.",
"type": "string"
},
"width": {
"description": "Width of the rendered image in pixels. Only one of width, height, or dpi can be defined.",
"type": "number"
}
},
"required": [
"type"
],
"type": "object"
},
{
"additionalProperties": false,
"properties": {
"keyValuePairs": {
"default": false,
"description": "Extract key-value pairs detected within the document contents. Examples of detected values are phone numbers, email addresses, currencies, numbers, dates, etc. Use only one of `plainText`, `keyValuePairs`, or `tables` at a time.",
"type": "boolean"
},
"language": {
"anyOf": [
{
"description": "Language for OCR text extraction.",
"type": "string"
},
{
"description": "Languages for OCR text extraction.",
"items": {
"type": "string"
},
"type": "array"
}
]
},
"plainText": {
"default": true,
"description": "Extract document text. Text is extracted via an OCR process.",
"type": "boolean"
},
"tables": {
"default": true,
"description": "Extract tabular data from the document. Use only one of `plainText`, `keyValuePairs`, or `tables` at a time.",
"type": "boolean"
},
"type": {
"const": "json-content",
"description": "Output as JSON with document contents.",
"type": "string"
}
},
"required": [
"type"
],
"type": "object"
},
{
"additionalProperties": false,
"properties": {
"type": {
"description": "Output as Office document.",
"enum": [
"docx",
"xlsx",
"pptx"
],
"type": "string"
}
},
"required": [
"type"
],
"type": "object"
},
{
"additionalProperties": false,
"properties": {
"layout": {
"description": "The layout type to use for conversion to HTML. `page` layout keeps the original structure of the document, segmented by page. `reflow` layout converts the document into a continuous flow of text, without page breaks.",
"enum": [
"page",
"reflow"
],
"type": "string"
},
"type": {
"const": "html",
"description": "Output as HTML.",
"type": "string"
}
},
"required": [
"type"
],
"type": "object"
},
{
"additionalProperties": false,
"properties": {
"type": {
"const": "markdown",
"description": "Output as Markdown.",
"type": "string"
}
},
"required": [
"type"
],
"type": "object"
}
],
"description": "Output format configuration. Supports PDF, PDF/A, PDF/UA, image, JSON content, Office document formats, HTML, and Markdown."
},
"parts": {
"description": "Parts of the document to be built.",
"items": {
"additionalProperties": false,
"properties": {
"content_type": {
"description": "Used to determine the file type when the file content type is not available and can't be inferred.",
"type": "string"
},
"file": {
"description": "The path to the file to be processed. Resolves to sandbox path if enabled, otherwise resolves to the local file system.",
"type": "string"
},
"pages": {
"additionalProperties": false,
"description": "Page range to include from the file (0-based indexing).",
"properties": {
"end": {
"default": -1,
"description": "End page index (0-based). Default is -1 (last page). Negative values count from the end.",
"type": "integer"
},
"start": {
"default": 0,
"description": "Start page index (0-based). Default is 0 (first page).",
"type": "integer"
}
},
"type": "object"
},
"password": {
"description": "The password for the input file if it is password-protected.",
"type": "string"
}
},
"required": [
"file"
],
"type": "object"
},
"type": "array"
}
},
"required": [
"parts"
],
"type": "object"
},
"outputPath": {
"description": "Path to the output file. Resolves to the sandbox path if the sandbox is enabled; otherwise resolves to the local file system.",
"type": "string"
}
},
"required": [
"instructions",
"outputPath"
],
"type": "object"
}