Azure Machine Learning Automated ML ONNX Model
2 min read · Apr 9, 2022
Build an Automated ML model with ONNX as the output format
Prerequisites
- Azure Account
- Storage Account
- Azure ML
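- Total size of the full data set is 1.5 billion rows
- NYC Taxi yellow data set (note: the code below samples the green taxi data via `NycTlcGreen` and registers it under the name `Yellow_taxi`)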
Code
- Log into Azure ML workspace
- Create a notebook that uses Python 3.6 and the Azure ML SDK
- Bring the data
from azureml.opendatasets import NycTlcGreen
import pandas as pd
from datetime import datetime
from dateutil.relativedelta import relativedelta

# Sample 2,000 random green-taxi trips from each of 12 consecutive monthly
# windows, starting with January 2015.
start = datetime.strptime("1/1/2015", "%m/%d/%Y")
end = datetime.strptime("1/31/2015", "%m/%d/%Y")

monthly_samples = []
for sample_month in range(12):
    # Shift the January window forward by `sample_month` months.
    temp_df_green = NycTlcGreen(
        start + relativedelta(months=sample_month),
        end + relativedelta(months=sample_month),
    ).to_pandas_dataframe()
    monthly_samples.append(temp_df_green.sample(2000))

# DataFrame.append was removed in pandas 2.0; a single concat over the
# collected samples also avoids re-copying the growing frame every month.
green_taxi_df = pd.concat(monthly_samples)

green_taxi_df.head(10)

# Columns that are not used as features for the model.
columns_to_remove = [
    "lpepDropoffDatetime", "puLocationId", "doLocationId", "extra", "mtaTax",
    "improvementSurcharge", "tollsAmount", "ehailFee", "tripType", "rateCodeID",
    "storeAndFwdFlag", "paymentType", "fareAmount", "tipAmount",
]
green_taxi_df = green_taxi_df.drop(columns=columns_to_remove)

green_taxi_df.head(5)

# Keep only rows whose pickup coordinates, trip distance, passenger count and
# total amount fall inside the ranges below.
# NOTE(review): the latitude/longitude limits are presumably a bounding box
# around New York City -- confirm.
final_df = green_taxi_df.query("pickupLatitude>=40.53 and pickupLatitude<=40.88")
final_df = final_df.query("pickupLongitude>=-74.09 and pickupLongitude<=-73.72")
final_df = final_df.query("tripDistance>=0.25 and tripDistance<31")
final_df = final_df.query("passengerCount>0 and totalAmount>0")

# The coordinates were only needed for the filter above; drop them before
# training. drop() returns a new frame instead of mutating a query() result.
columns_to_remove_for_training = ["pickupLongitude", "pickupLatitude", "dropoffLongitude", "dropoffLatitude"]
final_df = final_df.drop(columns=columns_to_remove_for_training)
- Register the above dataset
from azureml.core import Workspace, Datastore, Dataset
import pandas as pd

# Connect to the workspace and look up the blob datastore by name.
ws = Workspace.from_config()
datastore = Datastore.get(ws, 'workspaceblobstore')

# Register the cleaned DataFrame as a tabular dataset in the workspace.
# NOTE(review): the dataset is registered as "Yellow_taxi", but the frame was
# built from NycTlcGreen (green taxi) data -- confirm the intended name.
dataset = Dataset.Tabular.register_pandas_dataframe(
    final_df, datastore, "Yellow_taxi", show_progress=True
)

# Get the training dataset
Yellow_taxi = ws.datasets.get('Yellow_taxi')

# Import used by the next notebook cell.
from azureml.core.workspace import Workspace
import logging

from sklearn.model_selection import train_test_split

from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.experiment import Experiment
from azureml.train.automl import AutoMLConfig

ws = Workspace.from_config()

# Hold out 20% of the cleaned rows; x_test is used later to score the model.
# NOTE(review): x_train is never used. Training below runs on the registered
# dataset, which was created from the full final_df and therefore also
# contains the x_test rows -- the later evaluation is not on unseen data.
x_train, x_test = train_test_split(final_df, test_size=0.2, random_state=223)

cluster_name = "cpu-cluster"

try:
    # Check for existing compute target
    training_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        training_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        # Best effort: report the provisioning failure and carry on. The
        # experiment refers to the cluster by name, so a failure here will
        # surface again when the run is submitted.
        print(ex)

# Settings passed through to AutoMLConfig as keyword arguments.
automl_settings = {
    "iteration_timeout_minutes": 18,
    "experiment_timeout_hours": 2.0,
    "enable_early_stopping": False,
    "primary_metric": 'spearman_correlation',
    "featurization": 'auto',
    "verbosity": logging.INFO,
    "n_cross_validations": 5,
}

# Regression on "totalAmount", restricted to ONNX-compatible models.
automl_config = AutoMLConfig(
    task='regression',
    debug_log='automated_ml_errors.log',
    training_data=Yellow_taxi,
    label_column_name="totalAmount",
    compute_target=cluster_name,
    enable_onnx_compatible_models=True,
    **automl_settings
)
from azureml.widgets import RunDetails

# Submit the AutoML configuration as a run of the "Tutorial-NYCTaxi"
# experiment, streaming the run output.
experiment = Experiment(ws, "Tutorial-NYCTaxi")
remote_run = experiment.submit(automl_config, show_output=True)

# Show the notebook widget for the run.
RunDetails(remote_run).show()

# Retrieve the best run and its fitted model.
best_run, fitted_model = remote_run.get_output()
print(best_run)
print(fitted_model)

# Split the label column off the hold-out frame, then predict on the
# remaining feature columns. pop() removes "totalAmount" from x_test in place.
y_test = x_test.pop("totalAmount")

y_predict = fitted_model.predict(x_test)
from math import sqrt

from sklearn.metrics import mean_squared_error

print(y_predict[:10])

# Root mean squared error of the predictions against the held-out labels.
y_actual = y_test.values.flatten().tolist()
rmse = sqrt(mean_squared_error(y_actual, y_predict))
rmse  # bare expression: displayed only when it ends a notebook cell

# Accumulate total absolute error and total actual value over all pairs.
sum_actuals = sum_errors = 0

for actual_val, predict_val in zip(y_actual, y_predict):
    sum_errors += abs(actual_val - predict_val)
    sum_actuals += actual_val

# Printed as "MAPE", but computed as sum(|error|) / sum(actual), i.e. the
# total absolute error as a fraction of the total actual amount.
mean_abs_percent_error = sum_errors / sum_actuals
print("Model MAPE:")
print(mean_abs_percent_error)
print()
print("Model Accuracy:")
print(1 - mean_abs_percent_error)
Originally published at https://github.com.