Azure Machine Learning Automated ML Onnx Model

Balamurugan Balakreshnan
2 min read · Apr 9, 2022

Build an Automated ML model with ONNX as output

Prerequisites

  • Azure Account
  • Storage Account
  • Azure ML
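  • Total size is 1.5 billion rows
  • NYC Taxi data set (note: the code below loads the green taxi trips via NycTlcGreen and registers the sample under the name "Yellow_taxi")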

Code

  • Log into Azure ML workspace
  • Create a notebook using a Python 3.6 kernel with the Azure ML SDK
  • Bring in the data
from azureml.opendatasets import NycTlcGreen
import pandas as pd
from datetime import datetime
from dateutil.relativedelta import relativedelta

# Pull the 2015 green-taxi trips one month at a time and keep a random
# sample of 2000 rows from each month.
start = datetime.strptime("1/1/2015", "%m/%d/%Y")
end = datetime.strptime("1/31/2015", "%m/%d/%Y")
monthly_samples = []
for sample_month in range(12):
    temp_df_green = NycTlcGreen(
        start + relativedelta(months=sample_month),
        end + relativedelta(months=sample_month),
    ).to_pandas_dataframe()
    monthly_samples.append(temp_df_green.sample(2000))

# DataFrame.append was removed in pandas 2.0; concatenating once at the end
# gives the same frame (original row index preserved) without repeated copies.
green_taxi_df = pd.concat(monthly_samples)
green_taxi_df.head(10)  # notebook cell output

# Drop the columns that are not used by the rest of the walkthrough.
columns_to_remove = [
    "lpepDropoffDatetime", "puLocationId", "doLocationId", "extra", "mtaTax",
    "improvementSurcharge", "tollsAmount", "ehailFee", "tripType", "rateCodeID",
    "storeAndFwdFlag", "paymentType", "fareAmount", "tipAmount",
]
green_taxi_df = green_taxi_df.drop(columns=columns_to_remove)
green_taxi_df.head(5)  # notebook cell output

# Keep only rows inside the latitude/longitude bounding box, with a trip
# distance in [0.25, 31) and a positive passenger count and total amount.
final_df = green_taxi_df.query("pickupLatitude>=40.53 and pickupLatitude<=40.88")
final_df = final_df.query("pickupLongitude>=-74.09 and pickupLongitude<=-73.72")
final_df = final_df.query("tripDistance>=0.25 and tripDistance<31")
final_df = final_df.query("passengerCount>0 and totalAmount>0")

# The coordinates were only needed for the filtering above; remove them
# before training. drop() returns a new frame, so the filtered result from
# query() is not mutated in place.
columns_to_remove_for_training = ["pickupLongitude", "pickupLatitude", "dropoffLongitude", "dropoffLatitude"]
final_df = final_df.drop(columns=columns_to_remove_for_training)
  • Register the above dataset
from azureml.core import Workspace, Datastore, Dataset
import pandas as pd
from azureml.core.workspace import Workspace
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from sklearn.model_selection import train_test_split

# Connect to the workspace once and look up its blob datastore.
ws = Workspace.from_config()
datastore = Datastore.get(ws, 'workspaceblobstore')

# Hold out 20% of the rows BEFORE registering the training data. Registering
# the full final_df would put the x_test rows into the dataset that AutoML
# trains on, so the later evaluation on x_test would not be a real hold-out.
x_train, x_test = train_test_split(final_df, test_size=0.2, random_state=223)

dataset = Dataset.Tabular.register_pandas_dataframe(x_train, datastore, "Yellow_taxi", show_progress=True)
# Get the training dataset
Yellow_taxi = ws.datasets.get('Yellow_taxi')

cluster_name = "cpu-cluster"
try:
    # Check for existing compute target
    training_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        training_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        # Best-effort: the failure is only printed, so training_cluster may
        # be undefined after this point if provisioning did not succeed.
        print(ex)
import logging

from azureml.train.automl import AutoMLConfig
from azureml.core.experiment import Experiment
from azureml.widgets import RunDetails

# Settings shared by the AutoML run; unpacked into AutoMLConfig below.
automl_settings = {
    "iteration_timeout_minutes": 18,
    "experiment_timeout_hours": 2.0,
    "enable_early_stopping": False,
    "primary_metric": 'spearman_correlation',
    "featurization": 'auto',
    "verbosity": logging.INFO,
    "n_cross_validations": 5
}

# Regression on the registered dataset, predicting totalAmount, restricted
# to ONNX-compatible models.
automl_config = AutoMLConfig(
    task='regression',
    debug_log='automated_ml_errors.log',
    training_data=Yellow_taxi,
    label_column_name="totalAmount",
    compute_target=cluster_name,
    enable_onnx_compatible_models=True,
    **automl_settings
)

# Submit the run to the workspace and show progress in the notebook widget.
experiment = Experiment(ws, "Tutorial-NYCTaxi")
remote_run = experiment.submit(automl_config, show_output=True)
RunDetails(remote_run).show()

# Fetch the best child run together with its fitted model.
# NOTE(review): this returns the regular fitted model, not the ONNX one; the
# SDK documents get_output(return_onnx_model=True) for that - confirm.
best_run, fitted_model = remote_run.get_output()
print(best_run)
print(fitted_model)
from math import sqrt

from sklearn.metrics import mean_squared_error

# Split the label off the hold-out frame and score the remaining features.
y_test = x_test.pop("totalAmount")
y_predict = fitted_model.predict(x_test)
print(y_predict[:10])

# Root mean squared error between actual and predicted totals.
y_actual = y_test.values.flatten().tolist()
rmse = sqrt(mean_squared_error(y_actual, y_predict))
rmse  # notebook cell output

# Sum of absolute errors divided by the sum of actual values.
sum_actuals = sum_errors = 0
for actual_val, predict_val in zip(y_actual, y_predict):
    abs_error = abs(actual_val - predict_val)
    sum_errors = sum_errors + abs_error
    sum_actuals = sum_actuals + actual_val
mean_abs_percent_error = sum_errors / sum_actuals

print("Model MAPE:")
print(mean_abs_percent_error)
print()
print("Model Accuracy:")
print(1 - mean_abs_percent_error)

Originally published at https://github.com.

--

--