app_case.py - example.
An example of a supervised regression case.
This app is used to verify project workflow.
Author: Denise Case
Date: 2026-06
Process
- Load a CSV dataset.
- Train a supervised regression model.
- Evaluate model performance.
- Predict one new case.
- Create useful charts.
Data Source:
- data/raw/hours_scores_case.csv
Terminal command to run this file from the root project folder:
uv run python -m mlstudio.app_case
OBS
Don't edit this file - it should remain a working example.
It is used in each module to test the installation and workflow.
You never need to do anything with it, but if would like,
you can copy it, rename it, and modify your copy.
If you do, include your command to run it in the docstring above and in README.md.
DATASET_NAME
module-attribute
DATASET_NAME: Final[str] = 'hours_scores_case'
FEATURE_COLS
module-attribute
FEATURE_COLS: Final[list[str]] = [
'hours_studied',
'practice_quizzes',
'attendance_pct',
'sleep_hours',
'prior_score',
]
LOG
module-attribute
LOG: Logger = get_logger('ML', level='DEBUG')
RANDOM_STATE
module-attribute
RANDOM_STATE: Final[int] = 42
TARGET_COL
module-attribute
TARGET_COL: Final[str] = 'score'
TEST_SIZE
module-attribute
TEST_SIZE: Final[float] = 0.3
check_quality
check_quality(df: DataFrame) -> None
Check missing values and duplicate rows.
Source code in src/mlstudio/app_case.py
| def check_quality(df: pd.DataFrame) -> None:
"""Check missing values and duplicate rows."""
LOG.info("Missing values by column")
LOG.debug(f"\n{df.isna().sum()}")
duplicate_count: int = df.duplicated().sum()
LOG.info(f"Duplicate row count: {duplicate_count}")
|
inspect_basic
inspect_basic(df: DataFrame) -> None
Inspect basic dataset structure.
Source code in src/mlstudio/app_case.py
| def inspect_basic(df: pd.DataFrame) -> None:
"""Inspect basic dataset structure."""
LOG.info("Column names")
LOG.debug(f"{list(df.columns)}")
LOG.info("DataFrame info")
df.info()
LOG.info(f"Dataset shape: {df.shape[0]} rows, {df.shape[1]} columns")
|
load_data
load_data() -> pd.DataFrame
Load the case dataset from the data/raw folder.
Source code in src/mlstudio/app_case.py
| def load_data() -> pd.DataFrame:
"""Load the case dataset from the data/raw folder."""
LOG.info(f"Loading dataset: {DATASET_NAME}")
df: pd.DataFrame = pd.read_csv(f"data/raw/{DATASET_NAME}.csv")
LOG.info(f"Loaded: {df.shape[0]} rows, {df.shape[1]} columns")
LOG.debug(f"\n{df.head()}")
return df
|
main
Main function to run the supervised ML workflow.
Source code in src/mlstudio/app_case.py
| def main() -> None:
"""Main function to run the supervised ML workflow."""
log_header(LOG, "ML")
LOG.info("========================")
LOG.info("START main()")
LOG.info("========================")
LOG.info("Load dataset..............")
df = load_data()
LOG.info("Inspect dataset...........")
inspect_basic(df)
LOG.info("Check data quality........")
check_quality(df)
LOG.info("Create clean view.........")
df_clean = make_clean_view(df)
LOG.info("Train supervised model....")
model = train_model(df_clean)
LOG.info("Predict one case..........")
predict_example(model)
LOG.info("Create charts.............")
make_plots(df_clean, model)
LOG.info("Summarize workflow........")
summarize(df, df_clean)
LOG.info(
"----- in a script, call plt.show() once at the end to display all charts -----"
)
LOG.info(
"----- in a script, CLOSE the chart windows with the close button to CONTINUE -----"
)
plt.show()
LOG.info("Workflow complete")
LOG.info("IMPORTANT: This script creates chart windows.")
LOG.info("Close chart windows and terminate this process with CTRL+c as needed.")
LOG.info("========================")
LOG.info("Executed successfully!")
LOG.info("========================")
|
make_clean_view
make_clean_view(df: DataFrame) -> pd.DataFrame
Create a cleaned view for modeling.
Source code in src/mlstudio/app_case.py
| def make_clean_view(df: pd.DataFrame) -> pd.DataFrame:
"""Create a cleaned view for modeling."""
LOG.info("Creating clean modeling view")
selected_cols: list[str] = FEATURE_COLS + [TARGET_COL]
# Select only the columns we need.
df_selected: pd.DataFrame = df[selected_cols] # type: ignore[assignment]
# Drop rows with any missing values.
df_no_missing: pd.DataFrame = df_selected.dropna()
# Assign a copy of the no-missing DataFrame to df_clean to avoid SettingWithCopyWarning.
df_clean: pd.DataFrame = df_no_missing.copy()
LOG.info(f"Clean view: {df_clean.shape[0]} rows, {df_clean.shape[1]} columns")
return df_clean
|
make_plots
make_plots(
df_clean: DataFrame, model: LinearRegression
) -> None
Create charts for the supervised regression case.
Source code in src/mlstudio/app_case.py
| def make_plots(df_clean: pd.DataFrame, model: LinearRegression) -> None:
"""Create charts for the supervised regression case."""
LOG.info("Creating chart: hours studied vs score")
fig, ax = plt.subplots(figsize=(9, 5))
scatter_plt: Axes = sns.scatterplot(
data=df_clean,
x="hours_studied",
y=TARGET_COL,
ax=ax,
)
scatter_plt.set_title("Hours Studied vs Score (CLOSE chart to continue)")
scatter_plt.set_xlabel("Hours Studied")
scatter_plt.set_ylabel("Score")
LOG.info("Creating chart: model coefficients")
fig, ax = plt.subplots(figsize=(9, 5))
LOG.info(f"Got a figure {fig} and axes {ax} from plt.subplots().")
coefficient_df = pd.DataFrame(
{
"feature": FEATURE_COLS,
"coefficient": model.coef_,
}
).sort_values("coefficient", ascending=False)
bar_plt: Axes = sns.barplot(
data=coefficient_df,
x="coefficient",
y="feature",
ax=ax,
)
bar_plt.set_title("Model Coefficients (CLOSE chart to continue)")
bar_plt.set_xlabel("Coefficient")
bar_plt.set_ylabel("Feature")
|
predict_example
predict_example(model: LinearRegression) -> None
Use the trained model to predict one new student score.
Source code in src/mlstudio/app_case.py
| def predict_example(model: LinearRegression) -> None:
"""Use the trained model to predict one new student score."""
LOG.info("Predicting one new case")
new_case = pd.DataFrame(
[
{
"hours_studied": 6.5,
"practice_quizzes": 4,
"attendance_pct": 92,
"sleep_hours": 7.0,
"prior_score": 72,
}
]
)
predicted_score: float = model.predict(new_case)[0]
LOG.info(f"New case:\n{new_case}")
LOG.info(f"Predicted score: {predicted_score:.1f}")
|
summarize
summarize(df: DataFrame, df_clean: DataFrame) -> None
Log a brief summary.
Source code in src/mlstudio/app_case.py
| def summarize(df: pd.DataFrame, df_clean: pd.DataFrame) -> None:
"""Log a brief summary."""
LOG.info("========================")
LOG.info("SUMMARY")
LOG.info("========================")
LOG.info(f"Dataset: {DATASET_NAME}")
LOG.info(f"Original rows: {df.shape[0]}")
LOG.info(f"Clean rows: {df_clean.shape[0]}")
LOG.info(f"Features: {FEATURE_COLS}")
LOG.info(f"Target: {TARGET_COL}")
|
train_model
train_model(df_clean: DataFrame) -> LinearRegression
Train a supervised regression model.
Source code in src/mlstudio/app_case.py
| def train_model(df_clean: pd.DataFrame) -> LinearRegression:
"""Train a supervised regression model."""
LOG.info("Training LinearRegression model")
x = df_clean[FEATURE_COLS]
y = df_clean[TARGET_COL]
x_train, x_test, y_train, y_test = train_test_split(
x,
y,
test_size=TEST_SIZE,
random_state=RANDOM_STATE,
)
model = LinearRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
mae: float = mean_absolute_error(y_test, y_pred)
r2: float = r2_score(y_test, y_pred)
LOG.info(f"Mean absolute error: {mae:.2f}")
LOG.info(f"R-squared: {r2:.2f}")
return model
|