Generative AI Custom Evaluation
This example notebook showcases how a user can use the Evaluations SDK to benchmark large language models and to evaluate orchestration configurations or prompts for their use case. It uses the publicly available MedicationQA dataset, which consists of commonly asked consumer questions about medications. The workload computes industry-standard metrics to check the reliability of the responses generated by the LLM.
Setup
# Loading the credentials from the env file
from gen_ai_hub.evaluations.client import EvaluationClient
from dotenv import load_dotenv
import os
load_dotenv(override=True)
AICORE_BASE_URL = os.getenv("AICORE_BASE_URL")
AICORE_RESOURCE_GROUP = os.getenv("AICORE_RESOURCE_GROUP")
AICORE_AUTH_URL = os.getenv("AICORE_AUTH_URL")
AICORE_CLIENT_ID = os.getenv("AICORE_CLIENT_ID")
AICORE_CLIENT_SECRET = os.getenv("AICORE_CLIENT_SECRET")
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_BUCKET_ID = os.getenv("AWS_BUCKET_ID")
AWS_REGION = os.getenv("AWS_REGION")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
ORCHESTRATION_URL = os.getenv("ORCHESTRATION_URL")
client = EvaluationClient(
# direct ai_core_client can be added as a parameter if already created
base_url=AICORE_BASE_URL,
auth_url=AICORE_AUTH_URL,
client_id=AICORE_CLIENT_ID,
client_secret=AICORE_CLIENT_SECRET,
resource_group=AICORE_RESOURCE_GROUP,
aws_access_key_id=AWS_ACCESS_KEY_ID,
aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
orchestration_url=ORCHESTRATION_URL
)
# Another way to initialize the client
# client = EvaluationClient.from_env()
print(client.base_url)
print(client.ai_core_client.object_store_secrets.query(top=10, resource_group=AICORE_RESOURCE_GROUP))
Note: If the user is using the default resource group, an orchestration URL is already available and will be selected automatically. If the user has previously created object store secrets, this setup step can be skipped.
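If it is unclear whether object store secrets already exist, a quick check like the following sketch can help decide. It reuses the object_store_secrets query from above and assumes the response exposes a count attribute; verify the exact shape against your AI Core SDK version.
# Minimal sketch: inspect existing object store secrets before running the one-time setup.
# The `count` attribute on the query response is an assumption; check the actual payload.
existing_secrets = client.ai_core_client.object_store_secrets.query(top=10, resource_group=AICORE_RESOURCE_GROUP)
if getattr(existing_secrets, "count", 0) > 0:
    print("Object store secrets already exist - the setup step below can be skipped.")
else:
    print("No object store secrets found - run the one-time setup below.")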
One Time Creation of Secrets
import json
AWS_S3_ENDPOINT = "s3-eu-central-1.amazonaws.com"
# default secret is needed to store output artifacts that the evaluation job creates after it is completed
default_secret_creds = {
"data": {},
"type": "S3",
"pathPrefix": "sdkOutputFiles",
"endpoint": AWS_S3_ENDPOINT,
"bucket": AWS_BUCKET_ID,
"region": AWS_REGION,
"usehttps": "1",
}
# The input secret is used to load input artifacts required by the evaluation job.
# This is optional, as these files can also be loaded via the default secret path.
input_secret_creds = {
"data": {},
"name": "sdk-data",
"type": "S3",
"pathPrefix": "sdk_input_files/data",
"endpoint": AWS_S3_ENDPOINT,
"bucket": AWS_BUCKET_ID,
"region": AWS_REGION,
"usehttps": "1",
}
# Function scope:
# Creates the object store secrets and an orchestration deployment URL if one was not passed during initialization.
response = client.setup(
default_secret_body=default_secret_creds, input_secret_body=input_secret_creds, replace_existing=True
)
print(json.dumps(response, indent=4, ensure_ascii=False))
Helper function to list available models in the region
models_list = client.list_available_models()
model_names = [m["model"] for m in models_list]
print(model_names)
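Before referencing a model in the evaluation config, it can be worth confirming that it is actually offered in this region. A minimal sketch, using the model_names list from above:
# Sanity check: make sure the model used in the evaluation config below is available.
desired_model = "gpt-4o"  # illustrative choice, matching the first config below
if desired_model not in model_names:
    raise ValueError(f"{desired_model} is not available in this region; pick one of {model_names}")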
Helper function to see available System Defined Metrics
metrics_list = client.get_system_supported_metrics() # Fetches metrics info from Metric Management Service.
metric_names = [m["name"] for m in metrics_list]
print(metric_names)
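Metrics can later be referenced by id or by name (see MetricRef below). A small lookup sketch is shown here for convenience; it assumes each entry returned by get_system_supported_metrics() also carries an "id" field, which should be verified against the actual payload in your environment.
# Hypothetical helper: resolve a system-defined metric's id from its display name.
def metric_id_by_name(name):
    for m in metrics_list:
        if m["name"] == name:
            return m.get("id")  # "id" key is an assumption; check the actual response
    return None

print(metric_id_by_name("Pointwise Instruction Following"))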
Defining the Evaluation Config
To learn more about the structure and types of the parameters of any of these functions, call help() on it and read the docstring. For example, this is how to inspect EvaluationConfig:
from gen_ai_hub.evaluations.models.evaluation_config import EvaluationConfig
help(EvaluationConfig)
Generating the Evaluation Configs
There are multiple ways to create an EvaluationConfig: with a prompt template (inline or referenced from the prompt registry) or with an orchestration registry reference, and with multiple metric configurations in each EvaluationConfig.
from gen_ai_hub.prompt_registry.models.prompt_template import (
PromptTemplateSpec,
PromptTemplate,
)
from gen_ai_hub.orchestration.models.template_ref import TemplateRef
from gen_ai_hub.evaluations.models.dataset_config import Dataset
from gen_ai_hub.evaluations.models.metric_config import MetricConfig, MetricRef
from gen_ai_hub.orchestration.models.llm import LLM
from gen_ai_hub.evaluations.models.evaluation_config import EvaluationConfig
evaluation_config_list = [
EvaluationConfig(
llm=LLM(name="gpt-4o", version="latest"),
template=PromptTemplateSpec(
template=[
PromptTemplate(
role="user",
content="Provide a concise and informative response to the following consumer health question: {{?question}}"
)
]
),
template_variable_mapping={"question": "topic"},
dataset_config=Dataset("eval-data/testdata/medicalqna_dataset.csv"),
metrics=[
MetricConfig(
reference=MetricRef(id="3ea07c1f-5b10-4b12-bf46-6d429faf8010"),
variable_mapping={"reference": "ground_truth"},
),
],
),
EvaluationConfig(
orchestration_registry_reference="fa938934-ca94-4f8d-b59d-76c4570f0394",
template_variable_mapping={"question": "topic"},
dataset_config=Dataset("eval-data/testdata/medicalqna_dataset.csv"),
metrics=[
MetricConfig(
reference=MetricRef(id="3ea07c1f-5b10-4b12-bf46-6d429faf8010"),
variable_mapping={"reference": "ground_truth"},
),
MetricConfig(
reference=MetricRef(name="Content Filter on Input"),
),
],
),
EvaluationConfig(
llm=LLM(name="gpt-5", version="latest"),
template=TemplateRef(id="73282020-9141-46af-981f-c4816dd01d33"),
template_variable_mapping={"question": "topic"},
dataset_config=Dataset("eval-data/testdata/medicalqna_dataset.csv"),
metrics=[
MetricConfig(
reference=MetricRef(
name="Pointwise Instruction Following",
),
),
],
),
]
Running the evaluate function:
evaluation_runs = client.evaluate(evaluation_config_list)
for current_run in evaluation_runs:
    print("values for current run id are ******************** ")
    for key, value in vars(current_run).items():
        if key == "id" or key == "status":
            print(f"{key}: {value}")
Wait until each run is completed:
for current_run in evaluation_runs:
    print("waiting for the current run id of ", current_run.id)
    current_run.wait_for_completion(timeout=3600)  # the default timeout of 20 minutes can be overridden by passing a timeout in seconds; here it is 60 minutes
current_run_0 = evaluation_runs[0]
current_run_1 = evaluation_runs[1]
current_run_2 = evaluation_runs[2]
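Once all runs have finished, a quick status summary can be printed. This sketch assumes the id and status attributes inspected right after client.evaluate reflect the final state once wait_for_completion has returned.
# Summarize the final state of each run.
for current_run in evaluation_runs:
    print(current_run.id, current_run.status)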
Debugging
To debug the evaluation job in case of failures, the following helper methods on the run object provide more details.
import json
debug_info = current_run.get_debug_info() # To get the debug information related to status
print("debug info is ", debug_info)
debug_logs = current_run.get_debug_logs() # To see the full trace of logs of the evaluation job
print("Logs of evaluation job are ", json.dumps(debug_logs,indent=4,default=str))
Viewing the Aggregate Results
run1_data = current_run_0.results().aggregations()
run2_data = current_run_1.results().aggregations()
run3_data = current_run_2.results().aggregations()
print(run1_data)  # The aggregation results are fetched from the ML Tracking Service
# To see the aggregation results for the other runs, print run2_data or run3_data
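To print the aggregate results for every run in one pass, a simple loop works as well; this sketch reuses the same results().aggregations() call and the run id attribute shown above.
# Print aggregate results for all runs in one loop.
for idx, run in enumerate(evaluation_runs):
    print(f"--- aggregations for run {idx} ({run.id}) ---")
    print(run.results().aggregations())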
Viewing the Completion Response
data = current_run.results().completions()
def wrap_column(df, col, col_px=200):
    df = df.drop(columns=["created_at", "updated_at"])
    return (
        df.style.set_table_styles([
            {"selector": "table", "props": "table-layout: fixed; width: 100%;"},
            {"selector": f"td.col{df.columns.get_loc(col)}",
             "props": f"max-width: {col_px}px; white-space: pre-wrap; word-break: break-word;"}
        ], overwrite=False)
    )
wrap_column(data.head(), "completion_result", col_px=700)
Viewing the Metric Evaluation Response
data = current_run.results().metrics()
def wrap_column(df, col, col_px=200):
    df = df.drop(columns=["created_at", "updated_at"])
    return (
        df.style.set_table_styles([
            {"selector": "table", "props": "table-layout: fixed; width: 100%;"},
            {"selector": f"td.col{df.columns.get_loc(col)}",
             "props": f"max-width: {col_px}px; white-space: pre-wrap; word-break: break-word;"}
        ], overwrite=False)
    )
wrap_column(data.head(), "metric_result", col_px=700)
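For offline analysis, the per-record results can also be written to disk. This assumes, as the use of .head() and .style above suggests, that completions() and metrics() return pandas DataFrames; the file names are illustrative.
# Persist the per-record completion and metric results.
current_run.results().completions().to_csv("completion_results.csv", index=False)
current_run.results().metrics().to_csv("metric_results.csv", index=False)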