Generative AI Custom Evaluation
This example notebook showcases how a user can use the Evaluations SDK to benchmark large language models and to evaluate orchestration configurations or prompts for their use case. It uses the publicly available MedicationQA dataset, which consists of commonly asked consumer questions about medications. The workload computes industry-standard metrics to check the reliability of the responses generated by the LLM.
Setup
# Loading the credentials from the env file
from gen_ai_hub.evaluations.client import EvaluationClient
from dotenv import load_dotenv
import os
load_dotenv(override=True)
AICORE_BASE_URL = os.getenv("AICORE_BASE_URL")
AICORE_RESOURCE_GROUP = os.getenv("AICORE_RESOURCE_GROUP")
AICORE_AUTH_URL = os.getenv("AICORE_AUTH_URL")
AICORE_CLIENT_ID = os.getenv("AICORE_CLIENT_ID")
AICORE_CLIENT_SECRET = os.getenv("AICORE_CLIENT_SECRET")
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_BUCKET_ID = os.getenv("AWS_BUCKET_ID")
AWS_REGION = os.getenv("AWS_REGION")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
ORCHESTRATION_URL = os.getenv("ORCHESTRATION_URL")
client = EvaluationClient(
# direct ai_core_client can be added as a parameter if already created
base_url=AICORE_BASE_URL,
auth_url=AICORE_AUTH_URL,
client_id=AICORE_CLIENT_ID,
client_secret=AICORE_CLIENT_SECRET,
resource_group=AICORE_RESOURCE_GROUP,
aws_access_key_id=AWS_ACCESS_KEY_ID,
aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
orchestration_url=ORCHESTRATION_URL
)
# Another way to initialize the client
# client = EvaluationClient.from_env()
print(client.base_url)
print(client.ai_core_client.object_store_secrets.query(top=10, resource_group=AICORE_RESOURCE_GROUP))
Note: If the user is using the default resource group, an orchestration URL is already available and will be selected automatically. If the user has previously created object store secrets, this setup step can be skipped.
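If it is unclear whether object store secrets already exist, a quick check like the following sketch can help decide. It reuses the object_store_secrets query from above and assumes the response exposes a count attribute; verify the exact shape against your AI Core SDK version.
# Minimal sketch: inspect existing object store secrets before running the one-time setup.
# The `count` attribute on the query response is an assumption; check the actual payload.
existing_secrets = client.ai_core_client.object_store_secrets.query(top=10, resource_group=AICORE_RESOURCE_GROUP)
if getattr(existing_secrets, "count", 0) > 0:
    print("Object store secrets already exist - the setup step below can be skipped.")
else:
    print("No object store secrets found - run the one-time setup below.")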
One Time Creation of Secrets
import json
AWS_S3_ENDPOINT = "s3-eu-central-1.amazonaws.com"
# default secret is needed to store output artifacts that the evaluation job creates after it is completed
default_secret_creds = {
"data": {},
"type": "S3",
"pathPrefix": "sdkOutputFiles",
"endpoint": AWS_S3_ENDPOINT,
"bucket": AWS_BUCKET_ID,
"region": AWS_REGION,
"usehttps": "1",
}
# The input secret is used to load input artifacts required by the evaluation job.
# This is optional, as these files can also be loaded via the default secret path.
input_secret_creds = {
"data": {},
"name": "sdk-data",
"type": "S3",
"pathPrefix": "sdk_input_files/data",
"endpoint": AWS_S3_ENDPOINT,
"bucket": AWS_BUCKET_ID,
"region": AWS_REGION,
"usehttps": "1",
}
# Function scope:
# Creates the object store secrets and an orchestration deployment URL if one was not passed during initialization.
response = client.setup(
default_secret_body=default_secret_creds, input_secret_body=input_secret_creds, replace_existing=True
)
print(json.dumps(response, indent=4, ensure_ascii=False))
Helper function to list available models in the region
models_list = client.list_available_models()
model_names = [m["model"] for m in models_list]
print(model_names)
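Before referencing a model in the evaluation config, it can be worth confirming that it is actually offered in this region. A minimal sketch, using the model_names list from above:
# Sanity check: make sure the model used in the evaluation config below is available.
desired_model = "gpt-4o"  # illustrative choice, matching the first config below
if desired_model not in model_names:
    raise ValueError(f"{desired_model} is not available in this region; pick one of {model_names}")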
Helper function to see available System Defined Metrics
metrics_list = client.get_system_supported_metrics() # Fetches metrics info from Metric Management Service.
metric_names = [m["name"] for m in metrics_list]
print(metric_names)
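Metrics can later be referenced by id or by name (see MetricRef below). A small lookup sketch is shown here for convenience; it assumes each entry returned by get_system_supported_metrics() also carries an "id" field, which should be verified against the actual payload in your environment.
# Hypothetical helper: resolve a system-defined metric's id from its display name.
def metric_id_by_name(name):
    for m in metrics_list:
        if m["name"] == name:
            return m.get("id")  # "id" key is an assumption; check the actual response
    return None

print(metric_id_by_name("Pointwise Instruction Following"))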
Defining the Evaluation Config
To learn more about the structure and types of the parameters of any of these functions, call help() on it and read the docstring. For example, this is how to inspect EvaluationConfig:
from gen_ai_hub.evaluations.models.evaluation_config import EvaluationConfig
help(EvaluationConfig)
Generating the Evaluation Configs
There are multiple ways to create an EvaluationConfig: with a prompt template (inline or referenced from the prompt registry) or with an orchestration registry reference, and with multiple metric configurations in each EvaluationConfig.
from gen_ai_hub.prompt_registry.models.prompt_template import (
PromptTemplateSpec,
PromptTemplate,
)
from gen_ai_hub.orchestration.models.template_ref import TemplateRef
from gen_ai_hub.evaluations.models.dataset_config import Dataset
from gen_ai_hub.evaluations.models.metric_config import MetricConfig, MetricRef
from gen_ai_hub.orchestration.models.llm import LLM
from gen_ai_hub.evaluations.models.evaluation_config import EvaluationConfig
evaluation_config_list = [
EvaluationConfig(
llm=LLM(name="gpt-4o", version="latest"),
template=PromptTemplateSpec(
template=[
PromptTemplate(
role="user",
content="Provide a concise and informative response to the following consumer health question: {{?question}}"
)
]
),
template_variable_mapping={"question": "topic"},
dataset_config=Dataset("eval-data/testdata/medicalqna_dataset.csv"),
metrics=[
MetricConfig(
reference=MetricRef(id="3ea07c1f-5b10-4b12-bf46-6d429faf8010"),
variable_mapping={"reference": "ground_truth"},
),
],
),
EvaluationConfig(
orchestration_registry_reference="fa938934-ca94-4f8d-b59d-76c4570f0394",
template_variable_mapping={"question": "topic"},
dataset_config=Dataset("eval-data/testdata/medicalqna_dataset.csv"),
metrics=[
MetricConfig(
reference=MetricRef(id="3ea07c1f-5b10-4b12-bf46-6d429faf8010"),
variable_mapping={"reference": "ground_truth"},
),
MetricConfig(
reference=MetricRef(name="Content Filter on Input"),
),
],
),
EvaluationConfig(
llm=LLM(name="gpt-5", version="latest"),
template=TemplateRef(id="73282020-9141-46af-981f-c4816dd01d33"),
template_variable_mapping={"question": "topic"},
dataset_config=Dataset("eval-data/testdata/medicalqna_dataset.csv"),
metrics=[
MetricConfig(
reference=MetricRef(
name="Pointwise Instruction Following",
),
),
],
),
]
Running the evaluate function:
evaluation_runs = client.evaluate(evaluation_config_list)
for current_run in evaluation_runs:
    print("values for current run id are ******************** ")
    for key, value in vars(current_run).items():
        if key == "id" or key == "status":
            print(f"{key}: {value}")
Wait until each run is completed:
for current_run in evaluation_runs:
    print("waiting for the current run id of ", current_run.id)
    current_run.wait_for_completion(timeout=3600)  # the default timeout of 20 minutes can be overridden by passing a timeout in seconds; here it is 60 minutes
current_run_0 = evaluation_runs[0]
current_run_1 = evaluation_runs[1]
current_run_2 = evaluation_runs[2]
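Once all runs have finished, a quick status summary can be printed. This sketch assumes the id and status attributes inspected right after client.evaluate reflect the final state once wait_for_completion has returned.
# Summarize the final state of each run.
for current_run in evaluation_runs:
    print(current_run.id, current_run.status)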
Debugging
To debug the evaluation job in case of failures, the following helper methods on the run object provide more details.
import json
debug_info = current_run.get_debug_info() # To get the debug information related to status
print("debug info is ", debug_info)
debug_logs = current_run.get_debug_logs() # To see the full trace of logs of the evaluation job
print("Logs of evaluation job are ", json.dumps(debug_logs,indent=4,default=str))
Viewing the Aggregate Results
run1_data = current_run_0.results().aggregations()
run2_data = current_run_1.results().aggregations()
run3_data = current_run_2.results().aggregations()
print(run1_data)  # The aggregation results are fetched from the ML Tracking Service
# To see the aggregation results for the other runs, print run2_data or run3_data
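To print the aggregate results for every run in one pass, a simple loop works as well; this sketch reuses the same results().aggregations() call and the run id attribute shown above.
# Print aggregate results for all runs in one loop.
for idx, run in enumerate(evaluation_runs):
    print(f"--- aggregations for run {idx} ({run.id}) ---")
    print(run.results().aggregations())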
Viewing the Completion Response
data = current_run.results().completions()
def wrap_column(df, col, col_px=200):
    df = df.drop(columns=["created_at", "updated_at"])
    return (
        df.style.set_table_styles([
            {"selector": "table", "props": "table-layout: fixed; width: 100%;"},
            {"selector": f"td.col{df.columns.get_loc(col)}",
             "props": f"max-width: {col_px}px; white-space: pre-wrap; word-break: break-word;"}
        ], overwrite=False)
    )
wrap_column(data.head(), "completion_result", col_px=700)
Viewing the Metric Evaluation Response
data = current_run.results().metrics()
def wrap_column(df, col, col_px=200):
    df = df.drop(columns=["created_at", "updated_at"])
    return (
        df.style.set_table_styles([
            {"selector": "table", "props": "table-layout: fixed; width: 100%;"},
            {"selector": f"td.col{df.columns.get_loc(col)}",
             "props": f"max-width: {col_px}px; white-space: pre-wrap; word-break: break-word;"}
        ], overwrite=False)
    )
wrap_column(data.head(), "metric_result", col_px=700)
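For offline analysis, the per-record results can also be written to disk. This assumes, as the use of .head() and .style above suggests, that completions() and metrics() return pandas DataFrames; the file names are illustrative.
# Persist the per-record completion and metric results.
current_run.results().completions().to_csv("completion_results.csv", index=False)
current_run.results().metrics().to_csv("metric_results.csv", index=False)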