from unstructured import UnstructuredClient
# Placeholder credentials: substitute real values before running.
# NOTE(review): consider loading these from the environment or a secrets
# store instead of committing them to source control.
_client_credentials = {
    "username": "your-username",
    "password": "your-password",
}
client = UnstructuredClient(**_client_credentials)
# ============================================
# Step 1: Set up connectors
# ============================================
def _s3_connector_body(bucket_name):
    """Return the S3 connector settings for *bucket_name*.

    Every connector created in this step uses the same placeholder
    credentials and region; only the bucket name differs.
    """
    return {
        "vector_db_type": "s3",
        "bucket_name": bucket_name,
        "aws_access_key_id": "...",
        "aws_secret_access_key": "...",
        "region": "us-east-1",
    }


# Source: the bucket incoming documents are read from.
source = client.data_source.create(
    connector_name="incoming-documents",
    connector_body=_s3_connector_body("incoming-docs"),
)

# Destination: archive bucket for clean documents.
clean_dest = client.destination.create(
    destination_name="clean-archive",
    destination_body=_s3_connector_body("clean-documents"),
)

# Destination: bucket for sensitive documents awaiting review.
sensitive_dest = client.destination.create(
    destination_name="sensitive-review",
    destination_body=_s3_connector_body("sensitive-documents"),
)
# ============================================
# Step 2: Create PII taxonomy
# ============================================
# Tags defined by the "compliance-pii" taxonomy, one row per tag:
# (name, description, output_type).
_pii_tag_specs = [
    ("has_pii", "Whether document contains any PII", "boolean"),
    (
        "pii_categories",
        "Categories: financial, health, identity, contact",
        "list[string]",
    ),
    ("risk_score", "Risk score from 1-10 based on PII sensitivity", "integer"),
]
taxonomy = client.taxonomy.upsert(
    taxonomy_name="compliance-pii",
    taxonomy_description="Comprehensive PII detection for compliance",
    tags=[
        {"name": name, "description": description, "output_type": output_type}
        for name, description, output_type in _pii_tag_specs
    ],
)
# ============================================
# Step 3: Process and classify
# ============================================
# Request batch classification of the "incoming-documents" connector against
# the "compliance-pii" taxonomy. `results` is not read again in the visible
# part of this script.
_classify_request = {
    "connector_name": "incoming-documents",
    "taxonomy_name": "compliance-pii",
}
results = client.classify.generate_batch(**_classify_request)
# ============================================
# Step 4: Create slices for routing
# ============================================
# Clean slice: documents whose has_pii tag is False.
# NOTE(review): no risk_score condition is applied here, so "low risk"
# documents that do contain PII (risk_score below 7) appear to match neither
# slice -- confirm that this is intended.
_no_pii = dict(field="has_pii", operator="eq", value=False)
clean_slice = client.dataslice.create(
    dataslice_name="clean-documents",
    connector_name="incoming-documents",
    conditions=[_no_pii],
)

# Sensitive slice: documents whose risk_score tag is 7 or higher.
_high_risk = dict(field="risk_score", operator="gte", value=7)
sensitive_slice = client.dataslice.create(
    dataslice_name="sensitive-documents",
    connector_name="incoming-documents",
    conditions=[_high_risk],
)
# ============================================
# Step 5: Export to appropriate destinations
# ============================================
# Export each slice to its matching destination, as (destination, slice)
# pairs: clean documents to the archive first, then sensitive documents to
# the review destination. Both exports are file-level and include metadata.
_export_routes = (
    ("clean-archive", "clean-documents"),
    ("sensitive-review", "sensitive-documents"),
)
for _destination, _slice in _export_routes:
    client.destination.export(
        destination_name=_destination,
        dataslice_name=_slice,
        export_level="file",
        export_metadata=True,
    )

print("✓ Documents routed based on PII content")