Track Hyperparameter Optimization with Optuna and MLflow¶

Import Packages¶

In [1]:

Copied!





import math
import logging
from datetime import datetime, timedelta

import numpy as np
import optuna
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import mlflow
import math
import logging
from datetime import datetime, timedelta

import numpy as np
import optuna
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import mlflow

/Users/kcl/.venvs/feast/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

In [2]:

Copied!

MLFLOW_TRACKING_URI = "http://127.0.0.1:50666"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
MLFLOW_TRACKING_URI = "http://127.0.0.1:50666"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [3]:

Copied!

logger = logging.getLogger("mlflow")
logger.setLevel(logging.WARNING)
logger = logging.getLogger("mlflow")
logger.setLevel(logging.WARNING)

Generate Synthetic Data¶

In [4]:

Copied!





def generate_apple_sales_data_with_promo_adjustment(
    base_demand: int = 1000,
    n_rows: int = 5000,
    competitor_price_effect: float = -50.0,
):
    """
    Generates a synthetic dataset for predicting apple sales demand with multiple
    influencing factors.

    This function creates a pandas DataFrame with features relevant to apple sales.
    The features include date, average_temperature, rainfall, weekend flag, holiday flag,
    promotional flag, price_per_kg, competitor's price, marketing intensity, stock availability,
    and the previous day's demand. The target variable, 'demand', is generated based on a
    combination of these features with some added noise.

    Args:
        base_demand (int, optional): Base demand for apples. Defaults to 1000.
        n_rows (int, optional): Number of rows (days) of data to generate. Defaults to 5000.
        competitor_price_effect (float, optional): Effect of competitor's price being lower
                                                   on our sales. Defaults to -50.

    Returns:
        pd.DataFrame: DataFrame with features and target variable for apple sales prediction.

    Example:
        >>> df = generate_apple_sales_data_with_promo_adjustment(base_demand=1200, n_rows=6000)
        >>> df.head()
    """

    # Set seed for reproducibility
    np.random.seed(9999)

    # Create date range
    dates = [datetime.now() - timedelta(days=i) for i in range(n_rows)]
    dates.reverse()

    # Generate features
    df = pd.DataFrame(
        {
            "date": dates,
            "average_temperature": np.random.uniform(10, 35, n_rows),
            "rainfall": np.random.exponential(5, n_rows),
            "weekend": [(date.weekday() >= 5) * 1 for date in dates],
            "holiday": np.random.choice([0, 1], n_rows, p=[0.97, 0.03]),
            "price_per_kg": np.random.uniform(0.5, 3, n_rows),
            "month": [date.month for date in dates],
        }
    )

    # Introduce inflation over time (years)
    df["inflation_multiplier"] = 1 + (df["date"].dt.year - df["date"].dt.year.min()) * 0.03

    # Incorporate seasonality due to apple harvests
    df["harvest_effect"] = np.sin(2 * np.pi * (df["month"] - 3) / 12) + np.sin(
        2 * np.pi * (df["month"] - 9) / 12
    )

    # Modify the price_per_kg based on harvest effect
    df["price_per_kg"] = df["price_per_kg"] - df["harvest_effect"] * 0.5

    # Adjust promo periods to coincide with periods lagging peak harvest by 1 month
    peak_months = [4, 10]  # months following the peak availability
    df["promo"] = np.where(
        df["month"].isin(peak_months),
        1,
        np.random.choice([0, 1], n_rows, p=[0.85, 0.15]),
    )

    # Generate target variable based on features
    base_price_effect = -df["price_per_kg"] * 50
    seasonality_effect = df["harvest_effect"] * 50
    promo_effect = df["promo"] * 200

    df["demand"] = (
        base_demand
        + base_price_effect
        + seasonality_effect
        + promo_effect
        + df["weekend"] * 300
        + np.random.normal(0, 50, n_rows)
    ) * df["inflation_multiplier"]  # adding random noise

    # Add previous day's demand
    df["previous_days_demand"] = df["demand"].shift(1)
    df["previous_days_demand"].fillna(method="bfill", inplace=True)  # fill the first row

    # Introduce competitor pricing
    df["competitor_price_per_kg"] = np.random.uniform(0.5, 3, n_rows)
    df["competitor_price_effect"] = (
        df["competitor_price_per_kg"] < df["price_per_kg"]
    ) * competitor_price_effect

    # Stock availability based on past sales price (3 days lag with logarithmic decay)
    log_decay = -np.log(df["price_per_kg"].shift(3) + 1) + 2
    df["stock_available"] = np.clip(log_decay, 0.7, 1)

    # Marketing intensity based on stock availability
    # Identify where stock is above threshold
    high_stock_indices = df[df["stock_available"] > 0.95].index

    # For each high stock day, increase marketing intensity for the next week
    for idx in high_stock_indices:
        df.loc[idx : min(idx + 7, n_rows - 1), "marketing_intensity"] = np.random.uniform(0.7, 1)

    # If the marketing_intensity column already has values, this will preserve them;
    #  if not, it sets default values
    fill_values = pd.Series(np.random.uniform(0, 0.5, n_rows), index=df.index)
    df["marketing_intensity"].fillna(fill_values, inplace=True)

    # Adjust demand with new factors
    df["demand"] = df["demand"] + df["competitor_price_effect"] + df["marketing_intensity"]

    # Drop temporary columns
    df.drop(
        columns=[
            "inflation_multiplier",
            "harvest_effect",
            "month",
            "competitor_price_effect",
            "stock_available",
        ],
        inplace=True,
    )

    return df
def generate_apple_sales_data_with_promo_adjustment(
    base_demand: int = 1000,
    n_rows: int = 5000,
    competitor_price_effect: float = -50.0,
):
    """
    Generates a synthetic dataset for predicting apple sales demand with multiple
    influencing factors.

    This function creates a pandas DataFrame with features relevant to apple sales.
    The features include date, average_temperature, rainfall, weekend flag, holiday flag,
    promotional flag, price_per_kg, competitor's price, marketing intensity, stock availability,
    and the previous day's demand. The target variable, 'demand', is generated based on a
    combination of these features with some added noise.

    Args:
        base_demand (int, optional): Base demand for apples. Defaults to 1000.
        n_rows (int, optional): Number of rows (days) of data to generate. Defaults to 5000.
        competitor_price_effect (float, optional): Effect of competitor's price being lower
                                                   on our sales. Defaults to -50.

    Returns:
        pd.DataFrame: DataFrame with features and target variable for apple sales prediction.

    Example:
        >>> df = generate_apple_sales_data_with_promo_adjustment(base_demand=1200, n_rows=6000)
        >>> df.head()
    """

    # Set seed for reproducibility
    np.random.seed(9999)

    # Create date range
    dates = [datetime.now() - timedelta(days=i) for i in range(n_rows)]
    dates.reverse()

    # Generate features
    df = pd.DataFrame(
        {
            "date": dates,
            "average_temperature": np.random.uniform(10, 35, n_rows),
            "rainfall": np.random.exponential(5, n_rows),
            "weekend": [(date.weekday() >= 5) * 1 for date in dates],
            "holiday": np.random.choice([0, 1], n_rows, p=[0.97, 0.03]),
            "price_per_kg": np.random.uniform(0.5, 3, n_rows),
            "month": [date.month for date in dates],
        }
    )

    # Introduce inflation over time (years)
    df["inflation_multiplier"] = 1 + (df["date"].dt.year - df["date"].dt.year.min()) * 0.03

    # Incorporate seasonality due to apple harvests
    df["harvest_effect"] = np.sin(2 * np.pi * (df["month"] - 3) / 12) + np.sin(
        2 * np.pi * (df["month"] - 9) / 12
    )

    # Modify the price_per_kg based on harvest effect
    df["price_per_kg"] = df["price_per_kg"] - df["harvest_effect"] * 0.5

    # Adjust promo periods to coincide with periods lagging peak harvest by 1 month
    peak_months = [4, 10]  # months following the peak availability
    df["promo"] = np.where(
        df["month"].isin(peak_months),
        1,
        np.random.choice([0, 1], n_rows, p=[0.85, 0.15]),
    )

    # Generate target variable based on features
    base_price_effect = -df["price_per_kg"] * 50
    seasonality_effect = df["harvest_effect"] * 50
    promo_effect = df["promo"] * 200

    df["demand"] = (
        base_demand
        + base_price_effect
        + seasonality_effect
        + promo_effect
        + df["weekend"] * 300
        + np.random.normal(0, 50, n_rows)
    ) * df["inflation_multiplier"]  # adding random noise

    # Add previous day's demand
    df["previous_days_demand"] = df["demand"].shift(1)
    df["previous_days_demand"].fillna(method="bfill", inplace=True)  # fill the first row

    # Introduce competitor pricing
    df["competitor_price_per_kg"] = np.random.uniform(0.5, 3, n_rows)
    df["competitor_price_effect"] = (
        df["competitor_price_per_kg"] < df["price_per_kg"]
    ) * competitor_price_effect

    # Stock availability based on past sales price (3 days lag with logarithmic decay)
    log_decay = -np.log(df["price_per_kg"].shift(3) + 1) + 2
    df["stock_available"] = np.clip(log_decay, 0.7, 1)

    # Marketing intensity based on stock availability
    # Identify where stock is above threshold
    high_stock_indices = df[df["stock_available"] > 0.95].index

    # For each high stock day, increase marketing intensity for the next week
    for idx in high_stock_indices:
        df.loc[idx : min(idx + 7, n_rows - 1), "marketing_intensity"] = np.random.uniform(0.7, 1)

    # If the marketing_intensity column already has values, this will preserve them;
    #  if not, it sets default values
    fill_values = pd.Series(np.random.uniform(0, 0.5, n_rows), index=df.index)
    df["marketing_intensity"].fillna(fill_values, inplace=True)

    # Adjust demand with new factors
    df["demand"] = df["demand"] + df["competitor_price_effect"] + df["marketing_intensity"]

    # Drop temporary columns
    df.drop(
        columns=[
            "inflation_multiplier",
            "harvest_effect",
            "month",
            "competitor_price_effect",
            "stock_available",
        ],
        inplace=True,
    )

    return df

In [5]:

Copied!

df = generate_apple_sales_data_with_promo_adjustment(base_demand=1_000, n_rows=5000)
df
df = generate_apple_sales_data_with_promo_adjustment(base_demand=1_000, n_rows=5000)
df

/var/folders/zs/82l0dwfx1rdgz3490g0n3_qw0000gn/T/ipykernel_3699/1129670120.py:85: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["previous_days_demand"].fillna(method="bfill", inplace=True)  # fill the first row
/var/folders/zs/82l0dwfx1rdgz3490g0n3_qw0000gn/T/ipykernel_3699/1129670120.py:85: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
  df["previous_days_demand"].fillna(method="bfill", inplace=True)  # fill the first row
/var/folders/zs/82l0dwfx1rdgz3490g0n3_qw0000gn/T/ipykernel_3699/1129670120.py:108: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["marketing_intensity"].fillna(fill_values, inplace=True)

Out[5]:

	date	average_temperature	rainfall	weekend	holiday	price_per_kg	promo	demand	previous_days_demand	competitor_price_per_kg	marketing_intensity
0	2011-08-24 16:51:35.064975	30.584727	1.199291	0	0	1.726258	0	851.375336	851.276659	1.935346	0.098677
1	2011-08-25 16:51:35.064973	15.465069	1.037626	0	0	0.576471	0	906.855943	851.276659	2.344720	0.019318
2	2011-08-26 16:51:35.064971	10.786525	5.656089	0	0	2.513328	0	808.304909	906.836626	0.998803	0.409485
3	2011-08-27 16:51:35.064970	23.648154	12.030937	1	0	1.839225	0	1099.833810	857.895424	0.761740	0.872803
4	2011-08-28 16:51:35.064967	13.861391	4.303812	1	0	1.531772	0	1283.949061	1148.961007	2.123436	0.820779
...	...	...	...	...	...	...	...	...	...	...	...
4995	2025-04-27 16:51:35.054780	21.643051	3.821656	1	0	2.391010	1	1875.882437	1880.799278	1.504432	0.756489
4996	2025-04-28 16:51:35.054778	13.808813	1.080603	0	1	0.898693	1	1596.870527	1925.125948	1.343586	0.742145
4997	2025-04-29 16:51:35.054775	11.698227	1.911000	0	0	2.839860	1	1271.065524	1596.128382	2.771896	0.742145
4998	2025-04-30 16:51:35.054772	18.052081	1.000521	0	0	1.188440	1	1681.886638	1320.323379	2.564075	0.742145
4999	2025-05-01 16:51:35.054738	17.017294	0.650213	0	0	2.131694	0	1289.584771	1681.144493	0.785727	0.833140

5000 rows × 11 columns

In [6]:

Copied!





# Preprocess the dataset
X = df.drop(columns=["date", "demand"])
y = df["demand"]
train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.25)
dtrain = xgb.DMatrix(train_x, label=train_y)
dvalid = xgb.DMatrix(valid_x, label=valid_y)
# Preprocess the dataset
X = df.drop(columns=["date", "demand"])
y = df["demand"]
train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.25)
dtrain = xgb.DMatrix(train_x, label=train_y)
dvalid = xgb.DMatrix(valid_x, label=valid_y)

Define Plotting Functions¶

plot_correlation_with_demand
plot_residuals
plot_feature_importance

In [7]:

Copied!





import matplotlib.pyplot as plt
import seaborn as sns


def plot_correlation_with_demand(df, save_path=None):  # noqa: D417
    """
    Plots the correlation of each variable in the dataframe with the 'demand' column.

    Args:
    - df (pd.DataFrame): DataFrame containing the data, including a 'demand' column.
    - save_path (str, optional): Path to save the generated plot. If not specified, plot won't be saved.

    Returns:
    - None (Displays the plot on a Jupyter window)
    """

    # Compute correlations between all variables and 'demand'
    correlations = df.corr()["demand"].drop("demand").sort_values()

    # Generate a color palette from red to green
    colors = sns.diverging_palette(10, 130, as_cmap=True)
    color_mapped = correlations.map(colors)

    # Set Seaborn style
    sns.set_style(
        "whitegrid", {"axes.facecolor": "#c2c4c2", "grid.linewidth": 1.5}
    )  # Light grey background and thicker grid lines

    # Create bar plot
    fig = plt.figure(figsize=(12, 8))
    plt.barh(correlations.index, correlations.values, color=color_mapped)

    # Set labels and title with increased font size
    plt.title("Correlation with Demand", fontsize=18)
    plt.xlabel("Correlation Coefficient", fontsize=16)
    plt.ylabel("Variable", fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.grid(axis="x")

    plt.tight_layout()

    # Save the plot if save_path is specified
    if save_path:
        plt.savefig(save_path, format="png", dpi=600)

    # prevent matplotlib from displaying the chart every time we call this function
    plt.close(fig)

    return fig


# Test the function
correlation_plot = plot_correlation_with_demand(df, save_path="correlation_plot.png")
import matplotlib.pyplot as plt
import seaborn as sns


def plot_correlation_with_demand(df, save_path=None):  # noqa: D417
    """
    Plots the correlation of each variable in the dataframe with the 'demand' column.

    Args:
    - df (pd.DataFrame): DataFrame containing the data, including a 'demand' column.
    - save_path (str, optional): Path to save the generated plot. If not specified, plot won't be saved.

    Returns:
    - None (Displays the plot on a Jupyter window)
    """

    # Compute correlations between all variables and 'demand'
    correlations = df.corr()["demand"].drop("demand").sort_values()

    # Generate a color palette from red to green
    colors = sns.diverging_palette(10, 130, as_cmap=True)
    color_mapped = correlations.map(colors)

    # Set Seaborn style
    sns.set_style(
        "whitegrid", {"axes.facecolor": "#c2c4c2", "grid.linewidth": 1.5}
    )  # Light grey background and thicker grid lines

    # Create bar plot
    fig = plt.figure(figsize=(12, 8))
    plt.barh(correlations.index, correlations.values, color=color_mapped)

    # Set labels and title with increased font size
    plt.title("Correlation with Demand", fontsize=18)
    plt.xlabel("Correlation Coefficient", fontsize=16)
    plt.ylabel("Variable", fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.grid(axis="x")

    plt.tight_layout()

    # Save the plot if save_path is specified
    if save_path:
        plt.savefig(save_path, format="png", dpi=600)

    # prevent matplotlib from displaying the chart every time we call this function
    plt.close(fig)

    return fig


# Test the function
correlation_plot = plot_correlation_with_demand(df, save_path="correlation_plot.png")

In [8]:

Copied!





def plot_residuals(model, dvalid, valid_y, save_path=None):  # noqa: D417
    """
    Plots the residuals of the model predictions against the true values.

    Args:
    - model: The trained XGBoost model.
    - dvalid (xgb.DMatrix): The validation data in XGBoost DMatrix format.
    - valid_y (pd.Series): The true values for the validation set.
    - save_path (str, optional): Path to save the generated plot. If not specified, plot won't be saved.

    Returns:
    - None (Displays the residuals plot on a Jupyter window)
    """

    # Predict using the model
    preds = model.predict(dvalid)

    # Calculate residuals
    residuals = valid_y - preds

    # Set Seaborn style
    sns.set_style("whitegrid", {"axes.facecolor": "#c2c4c2", "grid.linewidth": 1.5})

    # Create scatter plot
    fig = plt.figure(figsize=(12, 8))
    plt.scatter(valid_y, residuals, color="blue", alpha=0.5)
    plt.axhline(y=0, color="r", linestyle="-")

    # Set labels, title and other plot properties
    plt.title("Residuals vs True Values", fontsize=18)
    plt.xlabel("True Values", fontsize=16)
    plt.ylabel("Residuals", fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.grid(axis="y")

    plt.tight_layout()

    # Save the plot if save_path is specified
    if save_path:
        plt.savefig(save_path, format="png", dpi=600)

    # Show the plot
    plt.close(fig)

    return fig
def plot_residuals(model, dvalid, valid_y, save_path=None):  # noqa: D417
    """
    Plots the residuals of the model predictions against the true values.

    Args:
    - model: The trained XGBoost model.
    - dvalid (xgb.DMatrix): The validation data in XGBoost DMatrix format.
    - valid_y (pd.Series): The true values for the validation set.
    - save_path (str, optional): Path to save the generated plot. If not specified, plot won't be saved.

    Returns:
    - None (Displays the residuals plot on a Jupyter window)
    """

    # Predict using the model
    preds = model.predict(dvalid)

    # Calculate residuals
    residuals = valid_y - preds

    # Set Seaborn style
    sns.set_style("whitegrid", {"axes.facecolor": "#c2c4c2", "grid.linewidth": 1.5})

    # Create scatter plot
    fig = plt.figure(figsize=(12, 8))
    plt.scatter(valid_y, residuals, color="blue", alpha=0.5)
    plt.axhline(y=0, color="r", linestyle="-")

    # Set labels, title and other plot properties
    plt.title("Residuals vs True Values", fontsize=18)
    plt.xlabel("True Values", fontsize=16)
    plt.ylabel("Residuals", fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.grid(axis="y")

    plt.tight_layout()

    # Save the plot if save_path is specified
    if save_path:
        plt.savefig(save_path, format="png", dpi=600)

    # Show the plot
    plt.close(fig)

    return fig

In [9]:

Copied!





def plot_feature_importance(model, booster):  # noqa: D417
    """
    Plots feature importance for an XGBoost model.

    Args:
    - model: A trained XGBoost model

    Returns:
    - fig: The matplotlib figure object
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    importance_type = "weight" if booster == "gblinear" else "gain"
    xgb.plot_importance(
        model,
        importance_type=importance_type,
        ax=ax,
        title=f"Feature Importance based on {importance_type}",
    )
    plt.tight_layout()
    plt.close(fig)

    return fig
def plot_feature_importance(model, booster):  # noqa: D417
    """
    Plots feature importance for an XGBoost model.

    Args:
    - model: A trained XGBoost model

    Returns:
    - fig: The matplotlib figure object
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    importance_type = "weight" if booster == "gblinear" else "gain"
    xgb.plot_importance(
        model,
        importance_type=importance_type,
        ax=ax,
        title=f"Feature Importance based on {importance_type}",
    )
    plt.tight_layout()
    plt.close(fig)

    return fig

Set up the Experiment¶

In [10]:

Copied!





def get_or_create_experiment(experiment_name):
    """
    Retrieve the ID of an existing MLflow experiment or create a new one if it doesn't exist.

    This function checks if an experiment with the given name exists within MLflow.
    If it does, the function returns its ID. If not, it creates a new experiment
    with the provided name and returns its ID.

    Parameters:
    - experiment_name (str): Name of the MLflow experiment.

    Returns:
    - str: ID of the existing or newly created MLflow experiment.
    """

    if experiment := mlflow.get_experiment_by_name(experiment_name):
        return experiment.experiment_id
    else:
        return mlflow.create_experiment(experiment_name)
def get_or_create_experiment(experiment_name):
    """
    Retrieve the ID of an existing MLflow experiment or create a new one if it doesn't exist.

    This function checks if an experiment with the given name exists within MLflow.
    If it does, the function returns its ID. If not, it creates a new experiment
    with the provided name and returns its ID.

    Parameters:
    - experiment_name (str): Name of the MLflow experiment.

    Returns:
    - str: ID of the existing or newly created MLflow experiment.
    """

    if experiment := mlflow.get_experiment_by_name(experiment_name):
        return experiment.experiment_id
    else:
        return mlflow.create_experiment(experiment_name)

In [11]:

Copied!

experiment_id = get_or_create_experiment("Apples Demand")
experiment_id
experiment_id = get_or_create_experiment("Apples Demand")
experiment_id

Out[11]:

'1'

In [12]:

Copied!

# Set the current active MLflow experiment
mlflow.set_experiment(experiment_id=experiment_id)
# Set the current active MLflow experiment
mlflow.set_experiment(experiment_id=experiment_id)

Out[12]:

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1746089509195, experiment_id='1', last_update_time=1746089509195, lifecycle_stage='active', name='Apples Demand', tags={}>

Create a Callback Function¶

In [13]:

Copied!

# override Optuna's default logging to ERROR only
optuna.logging.set_verbosity(optuna.logging.ERROR)
# override Optuna's default logging to ERROR only
optuna.logging.set_verbosity(optuna.logging.ERROR)

In [14]:

Copied!





# define a logging callback that will report on only new challenger parameter configurations if a
# trial has usurped the state of 'best conditions'
def champion_callback(study, frozen_trial):
    """
    Logging callback that will report when a new trial iteration improves upon existing
    best trial values.

    Note: This callback is not intended for use in distributed computing systems such as Spark
    or Ray due to the micro-batch iterative implementation for distributing trials to a cluster's
    workers or agents.
    The race conditions with file system state management for distributed trials will render
    inconsistent values with this callback.
    """

    prev_best_value = study.user_attrs.get("prev_best_value", None)

    if study.best_value and prev_best_value != study.best_value:
        study.set_user_attr("prev_best_value", study.best_value)

        # MLflow log when a new best trial is found
        full_params = frozen_trial.user_attrs.get("full_params", {})
        mse = frozen_trial.user_attrs.get("mse", None)
        with mlflow.start_run(run_name=f"best-trial-{frozen_trial.number}", nested=True):
            mlflow.log_params(full_params)
            mlflow.log_metric("mse", mse)
            mlflow.log_metric("rmse", math.sqrt(mse))

        if prev_best_value:
            improvement_percent = (abs(prev_best_value - study.best_value) / study.best_value) * 100
            print(
                f"Trial {frozen_trial.number} achieved value: {frozen_trial.value} with "
                f"{improvement_percent: .4f}% improvement"
            )
        else:
            print(f"Initial trial {frozen_trial.number} achieved value: {frozen_trial.value}")
# define a logging callback that will report on only new challenger parameter configurations if a
# trial has usurped the state of 'best conditions'
def champion_callback(study, frozen_trial):
    """
    Logging callback that will report when a new trial iteration improves upon existing
    best trial values.

    Note: This callback is not intended for use in distributed computing systems such as Spark
    or Ray due to the micro-batch iterative implementation for distributing trials to a cluster's
    workers or agents.
    The race conditions with file system state management for distributed trials will render
    inconsistent values with this callback.
    """

    prev_best_value = study.user_attrs.get("prev_best_value", None)

    if study.best_value and prev_best_value != study.best_value:
        study.set_user_attr("prev_best_value", study.best_value)

        # MLflow log when a new best trial is found
        full_params = frozen_trial.user_attrs.get("full_params", {})
        mse = frozen_trial.user_attrs.get("mse", None)
        with mlflow.start_run(run_name=f"best-trial-{frozen_trial.number}", nested=True):
            mlflow.log_params(full_params)
            mlflow.log_metric("mse", mse)
            mlflow.log_metric("rmse", math.sqrt(mse))

        if prev_best_value:
            improvement_percent = (abs(prev_best_value - study.best_value) / study.best_value) * 100
            print(
                f"Trial {frozen_trial.number} achieved value: {frozen_trial.value} with "
                f"{improvement_percent: .4f}% improvement"
            )
        else:
            print(f"Initial trial {frozen_trial.number} achieved value: {frozen_trial.value}")

Create a Objective Function¶

In [15]:

Copied!





def objective(trial):
    # Define hyperparameters
    params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
    }

    if params["booster"] == "gbtree" or params["booster"] == "dart":
        params["max_depth"] = trial.suggest_int("max_depth", 1, 9)
        params["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        params["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        params["grow_policy"] = trial.suggest_categorical(
            "grow_policy", ["depthwise", "lossguide"]
        )
    
    # Train XGBoost model
    bst = xgb.train(params, dtrain)
    pred_y = bst.predict(dvalid)
    mse = mean_squared_error(valid_y, pred_y)

    trial.set_user_attr("full_params", params)
    trial.set_user_attr("mse", mse)

    return mse
def objective(trial):
    # Define hyperparameters
    params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
    }

    if params["booster"] == "gbtree" or params["booster"] == "dart":
        params["max_depth"] = trial.suggest_int("max_depth", 1, 9)
        params["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        params["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        params["grow_policy"] = trial.suggest_categorical(
            "grow_policy", ["depthwise", "lossguide"]
        )
    
    # Train XGBoost model
    bst = xgb.train(params, dtrain)
    pred_y = bst.predict(dvalid)
    mse = mean_squared_error(valid_y, pred_y)

    trial.set_user_attr("full_params", params)
    trial.set_user_attr("mse", mse)

    return mse

In [16]:

Copied!





# Initiate the parent run and call the hyperparameter tuning child run logic
run_name = "fourth"
with mlflow.start_run(experiment_id=experiment_id, run_name=run_name, nested=True):
    # Initialize the Optuna study
    study = optuna.create_study(direction="minimize")

    # Execute the hyperparameter optimization trials.
    # Note the addition of the `champion_callback` inclusion to control our logging
    study.optimize(objective, n_trials=100, callbacks=[champion_callback])

    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_mse", study.best_value)
    mlflow.log_metric("best_rmse", math.sqrt(study.best_value))

    # Log tags
    mlflow.set_tags(
        tags={
            "project": "Apple Demand Project",
            "optimizer_engine": "optuna",
            "model_family": "xgboost",
            "feature_set_version": 1,
        }
    )

    # Log a fit model instance
    model = xgb.train(study.best_params, dtrain)

    # Log the correlation plot
    mlflow.log_figure(figure=correlation_plot, artifact_file="correlation_plot.png")

    # Log the feature importances plot
    importances = plot_feature_importance(model, booster=study.best_params.get("booster"))
    mlflow.log_figure(figure=importances, artifact_file="feature_importances.png")

    # Log the residuals plot
    residuals = plot_residuals(model, dvalid, valid_y)
    mlflow.log_figure(figure=residuals, artifact_file="residuals.png")

    artifact_path = "model"

    mlflow.xgboost.log_model(
        xgb_model=model,
        artifact_path=artifact_path,
        input_example=train_x.iloc[[0]],
        model_format="ubj",
        metadata={"model_data_version": 1},
    )

    # Get the logged model uri so that we can load it from the artifact store
    model_uri = mlflow.get_artifact_uri(artifact_path)
# Initiate the parent run and call the hyperparameter tuning child run logic
run_name = "fourth"
with mlflow.start_run(experiment_id=experiment_id, run_name=run_name, nested=True):
    # Initialize the Optuna study
    study = optuna.create_study(direction="minimize")

    # Execute the hyperparameter optimization trials.
    # Note the addition of the `champion_callback` inclusion to control our logging
    study.optimize(objective, n_trials=100, callbacks=[champion_callback])

    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_mse", study.best_value)
    mlflow.log_metric("best_rmse", math.sqrt(study.best_value))

    # Log tags
    mlflow.set_tags(
        tags={
            "project": "Apple Demand Project",
            "optimizer_engine": "optuna",
            "model_family": "xgboost",
            "feature_set_version": 1,
        }
    )

    # Log a fit model instance
    model = xgb.train(study.best_params, dtrain)

    # Log the correlation plot
    mlflow.log_figure(figure=correlation_plot, artifact_file="correlation_plot.png")

    # Log the feature importances plot
    importances = plot_feature_importance(model, booster=study.best_params.get("booster"))
    mlflow.log_figure(figure=importances, artifact_file="feature_importances.png")

    # Log the residuals plot
    residuals = plot_residuals(model, dvalid, valid_y)
    mlflow.log_figure(figure=residuals, artifact_file="residuals.png")

    artifact_path = "model"

    mlflow.xgboost.log_model(
        xgb_model=model,
        artifact_path=artifact_path,
        input_example=train_x.iloc[[0]],
        model_format="ubj",
        metadata={"model_data_version": 1},
    )

    # Get the logged model uri so that we can load it from the artifact store
    model_uri = mlflow.get_artifact_uri(artifact_path)

🏃 View run best-trial-0 at: http://127.0.0.1:50666/#/experiments/1/runs/25de8f9716124a0e953ec50a03a85bdd
🧪 View experiment at: http://127.0.0.1:50666/#/experiments/1
Initial trial 0 achieved value: 64200.9377312124
🏃 View run best-trial-2 at: http://127.0.0.1:50666/#/experiments/1/runs/f8c4f4c28f7346f39e513beaef0b2eb2
🧪 View experiment at: http://127.0.0.1:50666/#/experiments/1
Trial 2 achieved value: 64072.863216692545 with  0.1999% improvement
🏃 View run best-trial-7 at: http://127.0.0.1:50666/#/experiments/1/runs/a7c0d3ff781b4722bf6a8f4a1c29cc21
🧪 View experiment at: http://127.0.0.1:50666/#/experiments/1
Trial 7 achieved value: 19370.33276815125 with  230.7783% improvement
🏃 View run best-trial-9 at: http://127.0.0.1:50666/#/experiments/1/runs/b50292d1b7f54e4ab43bcd950c901853
🧪 View experiment at: http://127.0.0.1:50666/#/experiments/1
Trial 9 achieved value: 19280.226444123156 with  0.4674% improvement
🏃 View run best-trial-12 at: http://127.0.0.1:50666/#/experiments/1/runs/ec7572de81d74d2e814a246deb279585
🧪 View experiment at: http://127.0.0.1:50666/#/experiments/1
Trial 12 achieved value: 19249.251703373608 with  0.1609% improvement
🏃 View run best-trial-15 at: http://127.0.0.1:50666/#/experiments/1/runs/3a5f94921c5f425599caf305b7bcea63
🧪 View experiment at: http://127.0.0.1:50666/#/experiments/1
Trial 15 achieved value: 19143.276660193736 with  0.5536% improvement
🏃 View run best-trial-26 at: http://127.0.0.1:50666/#/experiments/1/runs/2d64a8f2a9d046379f79c8fd2ed84da8
🧪 View experiment at: http://127.0.0.1:50666/#/experiments/1
Trial 26 achieved value: 19042.25204424367 with  0.5305% improvement
🏃 View run best-trial-40 at: http://127.0.0.1:50666/#/experiments/1/runs/0ff1b4a5bba7468abf7221b0c2567857
🧪 View experiment at: http://127.0.0.1:50666/#/experiments/1
Trial 40 achieved value: 18255.62810065196 with  4.3089% improvement
🏃 View run best-trial-70 at: http://127.0.0.1:50666/#/experiments/1/runs/8688d93cb6b2495eaca05a8d831be39a
🧪 View experiment at: http://127.0.0.1:50666/#/experiments/1
Trial 70 achieved value: 16690.920638134605 with  9.3746% improvement
🏃 View run best-trial-71 at: http://127.0.0.1:50666/#/experiments/1/runs/08f702060d76408a92ba803e459b9d38
🧪 View experiment at: http://127.0.0.1:50666/#/experiments/1
Trial 71 achieved value: 15909.402041712121 with  4.9123% improvement
🏃 View run best-trial-72 at: http://127.0.0.1:50666/#/experiments/1/runs/e89e6529f4174a8a93d7f78c5a8f2a61
🧪 View experiment at: http://127.0.0.1:50666/#/experiments/1
Trial 72 achieved value: 15573.390403470594 with  2.1576% improvement
🏃 View run best-trial-76 at: http://127.0.0.1:50666/#/experiments/1/runs/07bef624d7084837a4de95636ae72508
🧪 View experiment at: http://127.0.0.1:50666/#/experiments/1
Trial 76 achieved value: 15331.608560002875 with  1.5770% improvement
🏃 View run best-trial-85 at: http://127.0.0.1:50666/#/experiments/1/runs/094dec77387f47ebad1869b6ca2487a8
🧪 View experiment at: http://127.0.0.1:50666/#/experiments/1
Trial 85 achieved value: 15330.17016465973 with  0.0094% improvement

/Users/kcl/.venvs/feast/lib/python3.10/site-packages/mlflow/types/utils.py:452: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.
  warnings.warn(

🏃 View run fourth at: http://127.0.0.1:50666/#/experiments/1/runs/b1d6cefb8b9c434895e7627fe7529e4e
🧪 View experiment at: http://127.0.0.1:50666/#/experiments/1

In [17]:

Copied!

model_uri
model_uri

Out[17]:

'mlflow-artifacts:/1/b1d6cefb8b9c434895e7627fe7529e4e/artifacts/model'

Load the Model and Run Batch Prediction¶

In [35]:

Copied!

loaded = mlflow.xgboost.load_model(model_uri)
loaded = mlflow.xgboost.load_model(model_uri)

Downloading artifacts: 100%|████████████████████████████████████████████████████████████████████████████████| 7/7 [00:22<00:00,  3.18s/it]

In [36]:

Copied!

batch_dmatrix = xgb.DMatrix(X)

inference = loaded.predict(batch_dmatrix)

infer_df = df.copy()

infer_df["predicted_demand"] = inference
batch_dmatrix = xgb.DMatrix(X)

inference = loaded.predict(batch_dmatrix)

infer_df = df.copy()

infer_df["predicted_demand"] = inference

In [37]:

Copied!

infer_df
infer_df

Out[37]:

	date	average_temperature	rainfall	weekend	holiday	price_per_kg	promo	demand	previous_days_demand	competitor_price_per_kg	marketing_intensity	predicted_demand
0	2011-08-22 16:36:16.256208	30.584727	1.199291	0	0	1.726258	0	851.375336	851.276659	1.935346	0.098677	938.926270
1	2011-08-23 16:36:16.256201	15.465069	1.037626	0	0	0.576471	0	906.855943	851.276659	2.344720	0.019318	1016.131104
2	2011-08-24 16:36:16.256200	10.786525	5.656089	0	0	2.513328	0	808.304909	906.836626	0.998803	0.409485	888.394958
3	2011-08-25 16:36:16.256198	23.648154	12.030937	0	0	1.839225	0	799.833810	857.895424	0.761740	0.872803	928.042908
4	2011-08-26 16:36:16.256197	13.861391	4.303812	0	0	1.531772	0	983.949061	848.961007	2.123436	0.820779	985.474487
...	...	...	...	...	...	...	...	...	...	...	...	...
4995	2025-04-25 16:36:16.246942	21.643051	3.821656	0	0	2.391010	1	1449.882437	1454.799278	1.504432	0.756489	1287.835571
4996	2025-04-26 16:36:16.246940	13.808813	1.080603	1	1	0.898693	1	2022.870527	1499.125948	1.343586	0.742145	1742.767700
4997	2025-04-27 16:36:16.246939	11.698227	1.911000	1	0	2.839860	1	1697.065524	2022.128382	2.771896	0.742145	1757.431885
4998	2025-04-28 16:36:16.246935	18.052081	1.000521	0	0	1.188440	1	1681.886638	1746.323379	2.564075	0.742145	1473.662109
4999	2025-04-29 16:36:16.246853	17.017294	0.650213	0	0	2.131694	1	1573.584771	1681.144493	0.785727	0.833140	1346.723755

5000 rows × 12 columns