How to Build an End-to-End Production Grade Machine Learning Pipeline with ZenML, Including Custom Materializers, Metadata Tracking, and Hyperparameter Optimization


@step(enable_cache=True)
def load_data() -> Annotated[DatasetBundle, "raw_dataset"]:
    """Load the sklearn breast-cancer dataset and package it as a DatasetBundle.

    Caching is enabled because the source data is static; re-runs reuse the
    stored artifact instead of re-fetching.
    """
    dataset = load_breast_cancer()
    bundle = DatasetBundle(
        dataset.data,
        dataset.target,
        dataset.feature_names,
        stats={"source": "sklearn.datasets.load_breast_cancer"},
    )
    return bundle


@step
def split_and_scale(
    bundle: DatasetBundle,
    test_size: float = 0.2,
    random_state: int = 42,
) -> Tuple[
    Annotated[np.ndarray, "X_train"],
    Annotated[np.ndarray, "X_test"],
    Annotated[np.ndarray, "y_train"],
    Annotated[np.ndarray, "y_test"],
]:
    """Stratified train/test split followed by feature standardization.

    The scaler is fit on the training portion only, then applied to both
    splits (avoids test-set leakage). Resulting split sizes are recorded
    as step metadata.
    """
    train_X, test_X, train_y, test_y = train_test_split(
        bundle.X,
        bundle.y,
        test_size=test_size,
        random_state=random_state,
        stratify=bundle.y,
    )
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    test_X = scaler.transform(test_X)
    log_metadata(metadata={"train_size": len(train_X), "test_size": len(test_X)})
    return train_X, test_X, train_y, test_y


@step
def train_candidate(
    X_train: np.ndarray,
    y_train: np.ndarray,
    model_type: str = "random_forest",
    n_estimators: int = 100,
    max_depth: int = 5,
) -> Annotated[Any, "candidate_model"]:
    """Fit one candidate classifier and log its configuration as metadata.

    Recognized model types are "random_forest" and "gradient_boosting";
    any other value (including typos) falls back to logistic regression,
    which ignores n_estimators/max_depth.
    """
    ensemble_factories = {
        "random_forest": RandomForestClassifier,
        "gradient_boosting": GradientBoostingClassifier,
    }
    factory = ensemble_factories.get(model_type)
    if factory is not None:
        model = factory(
            n_estimators=n_estimators, max_depth=max_depth, random_state=42
        )
    else:
        # NOTE: unrecognized model_type silently becomes logistic regression
        # (original behavior, preserved).
        model = LogisticRegression(max_iter=2000, random_state=42)
    model.fit(X_train, y_train)
    log_metadata(metadata={
        "model_type": model_type,
        "hyperparameters": {"n_estimators": n_estimators, "max_depth": max_depth},
    })
    return model



Source link

Leave a Reply

Your email address will not be published. Required fields are marked *