Example: Analytics on Enron Email
Goals
In this example, we will walk through how you can use our Python SDK to extract key features from thousands of emails in seconds. We'll be using a few Enron emails as our data source.
You can see the code end-to-end here.
Get your API key
First, go to https://dashboard.runtrellis.com/sign-up and create your account. Then click Settings
in the lower right (or visit the settings page) and copy your API key.
Install the Trellis Python SDK
pip install trellis-python
Upload and process your data:
In this step, upload the files and process them.
import time
from trellis.client import TrellisApi
from trellis import TransformationOperation, TransformParams
# Credentials and target project for every call below.
YOUR_API_KEY = "YOUR_API_KEY"  # add your api key
YOUR_PROJ_NAME = "YOUR_PROJECT_NAME"  # make sure to use snake case

client = TrellisApi(api_key=YOUR_API_KEY)

# Sample Enron emails, referenced by their S3-hosted URLs.
ENRON_EMAIL_URLS = [
    "https://trellis-ai-public.s3.us-west-2.amazonaws.com/enron_clay_johnson_email.txt",
    "https://trellis-ai-public.s3.us-west-2.amazonaws.com/enron_memorial_day_plan.txt",
    "https://trellis-ai-public.s3.us-west-2.amazonaws.com/enron_mx_secretary_energy.txt",
]

# Step 1: register the remote files as assets of the project.
# The original walkthrough notes that either URLs or files can be passed in;
# this example uses URLs.
client.assets.upload_assets(proj_name=YOUR_PROJ_NAME, urls=ENRON_EMAIL_URLS)

# Step 2: run extraction over the assets of the project.
client.assets_extract.extract_files(proj_name=YOUR_PROJ_NAME)
Define the transformation you want
In this example, we want to extract who the email is from, the topic of the email based on our defined taxonomy, people mentioned, and whether this is a compliance risk.
# Label -> description taxonomies used by the classification columns below.
# The description strings are passed to the service verbatim as `output_values`.

# NOTE(review): the "No" description reads as if the word "contain" is missing;
# left unchanged here because the text is part of the request payload.
COMPLIANCE_RISK_LABELS = {
    "No": "the email does not potential compliance violation",
    "Yes": "the email contains potential compliance violation",
}

# NOTE(review): the "personal_professional" description has an unclosed
# parenthesis; left unchanged for the same reason.
GENRE_LABELS = {
    "employment": "topics related to job seeking, hiring, recommendations, etc",
    "empty_message": "no information in the text",
    "document_review": "collaborating on document, editing",
    "purely_personal": "personal chat unrelated to work",
    "company_business": "related to company business",
    "logistics_arrangement": "meeting scheduling, technical support, etc",
    "personal_professional": "Personal but in professional context (e.g., it was good working with you",
}

PRIMARY_TOPIC_LABELS = {
    "legal": "Topics around legal advice or involve legal matters",
    "other": "Other topics not include in the existing categories",
    "political": "Topics related political influence / contributions / contacts",
    "regulation": "Topics around regulations and regulators (includes price caps)",
    "company_image": "Topics around company image",
    "energy_crisis": "Topics related to california energy crisis / california politics",
    "internal_project": "Topics around internal projects -- progress and strategy",
    "internal_operations": "Topics around Internal operations",
}

EMOTIONAL_TONE_LABELS = {
    "anger": "The email has angry, aggresive or agitate tone.",
    "humor": "The email is funny or has humorous tone",
    "secret": "The email has secrecy / confidentiality tone or contains confidential information.",
    "concern": "The email seems concern, worry or anxious",
    "neutral": "The email is neutral",
    "gratitude": "The email has gratitude or admiration tone",
}

# One TransformationOperation per output column, in the order the columns
# are declared: three extractions, then a classification, a generated
# summary, and three more classifications.
transformation_operations = [
    # --- extraction columns ---
    TransformationOperation(
        column_name="email_from",
        column_type="text",
        transform_type="extraction",
        task_description="extract who sent the email. This should be in From",
    ),
    TransformationOperation(
        column_name="email_to",
        column_type="text[]",
        transform_type="extraction",
        task_description="Extract a list of emails in the To section",
    ),
    TransformationOperation(
        column_name="people_mentioned",
        column_type="text[]",
        transform_type="extraction",
        task_description="Extract a list of people mentioned in the email. Return empty list if no one is being mentioned.",
    ),
    # --- compliance classification ---
    TransformationOperation(
        column_name="compliance_risk",
        column_type="text",
        transform_type="classification",
        task_description="Classify whether the email contains information that's potential compliance violation",
        output_values=COMPLIANCE_RISK_LABELS,
    ),
    # --- generated summary ---
    TransformationOperation(
        column_name="one_line_summary",
        column_type="text",
        transform_type="generation",
        task_description="Summarize the email in one line",
    ),
    # --- taxonomy classifications ---
    TransformationOperation(
        column_name="genre",
        column_type="text",
        transform_type="classification",
        task_description="Classify the genre of the emails.",
        output_values=GENRE_LABELS,
    ),
    TransformationOperation(
        column_name="primary_topics",
        column_type="text",
        transform_type="classification",
        task_description="Classify the specific topics of conversation",
        output_values=PRIMARY_TOPIC_LABELS,
    ),
    TransformationOperation(
        column_name="emotional_tone",
        column_type="text",
        transform_type="classification",
        task_description="Classify the tone and intent of the message.",
        output_values=EMOTIONAL_TONE_LABELS,
    ),
]

# Bundle the operations with the model selection for the transform request.
transform_params = TransformParams(
    model="trellis-premium",
    operations=transformation_operations,
)
Run the transformations
Next, we want to run the transformation, which runs as a job in the background. You can either pass in a callback URL or poll for the results; in this example we poll.
# Create the transformation. The job runs in the background; a callback URL
# can be supplied at this step instead of polling (polling is used here).
transform_response = client.transforms.create_transform(
    proj_name=YOUR_PROJ_NAME,
    transform_params=transform_params,
)
transform_id = transform_response.data.transform_id

# Polling cadence and an upper bound on how long to wait. Without a bound,
# a transform that never reaches "completed" would keep this script looping
# forever.
POLL_INTERVAL_SECONDS = 1
MAX_WAIT_SECONDS = 600  # adjust to the size of your project

# time.monotonic() is unaffected by system clock adjustments, so the
# deadline cannot jump forwards or backwards while we wait.
deadline = time.monotonic() + MAX_WAIT_SECONDS

while True:
    transform_result = client.transforms.get_transform_results(transform_id=transform_id)
    status = transform_result.transform_status
    print(f"Current status: {status}")

    if status == "completed":
        print("Transform completed.")
        print(transform_result.data)
        break

    # NOTE(review): if the API reports a terminal failure status, it would be
    # better to stop immediately on it -- confirm the possible status values
    # against the API reference. Until then the deadline bounds the wait.
    if time.monotonic() >= deadline:
        raise TimeoutError(
            f"Transform {transform_id} did not complete within "
            f"{MAX_WAIT_SECONDS} seconds (last status: {status})"
        )

    time.sleep(POLL_INTERVAL_SECONDS)  # wait before checking the status again
Get results and visualize
import pandas as pd
from tabulate import tabulate
# Load the transform output into a DataFrame.
df = pd.DataFrame(transform_result.data)

# Render the DataFrame as a left-aligned text table and print it.
tabulated_str = tabulate(
    df,
    headers="keys",
    tablefmt="pretty",
    stralign="left",
    numalign="left",
)
print(tabulated_str)
Updated about 2 months ago