import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


def load_data(path: str = "Customer_Churn.csv") -> pd.DataFrame:
    """Read the churn dataset from a CSV file and report its dimensions.

    Args:
        path: Location of the CSV file. Defaults to ``Customer_Churn.csv``,
            a relative file name.

    Returns:
        The parsed file contents as a DataFrame.
    """
    frame = pd.read_csv(path)
    dimensions = frame.shape
    print("Loaded data. Shape:", dimensions)
    return frame


def _encode_churn(churn: pd.Series) -> pd.Series:
    """Return the churn labels as numeric 0/1 values.

    ``"No"``/``"Yes"`` are mapped explicitly and cast to ``int64`` instead of
    going through ``Series.replace``: that call relies on an implicit
    object-to-int downcast which pandas has deprecated, and an object-dtype
    target can be rejected by the scikit-learn metrics used downstream.
    A column that is already numeric is returned unchanged.

    Raises:
        ValueError: If a non-numeric column holds anything other than
            ``"No"`` or ``"Yes"`` (including missing values).
    """
    if pd.api.types.is_numeric_dtype(churn):
        return churn

    encoded = churn.map({"No": 0, "Yes": 1})
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = churn[unmapped].unique().tolist()
        raise ValueError(
            f"Unexpected values in 'Churn' column: {unexpected!r}; "
            "expected only 'No' or 'Yes'."
        )
    return encoded.astype("int64")


def build_model(data: pd.DataFrame) -> None:
    """Fit a linear-regression churn baseline and print RMSE and ROC AUC.

    The function prints a few exploratory summaries (target distribution,
    missing values, mean tenure per churn group), one-hot encodes every
    non-numeric feature, passes numeric features through unchanged, fits
    ``LinearRegression`` on a stratified 80/20 split and scores the
    predictions (clipped to ``[0, 1]``) on the held-out part.

    Args:
        data: Frame containing at least the ``customerID``, ``Churn`` and
            ``tenure`` columns. ``Churn`` holds ``"No"``/``"Yes"`` labels or
            numeric 0/1 values. The input frame is not modified.

    Raises:
        KeyError: If one of the required columns is missing.
        ValueError: If ``Churn`` contains labels other than ``"No"``/``"Yes"``.
    """
    print("\nTarget distribution:")
    print(data["Churn"].value_counts(normalize=True))

    print("\nMissing values per column:")
    print(data.isnull().sum())

    # Work on a copy so the caller's frame is left untouched.
    data = data.copy()
    data["Churn"] = _encode_churn(data["Churn"])

    print("\nAverage tenure by churn group:")
    print(data.groupby("Churn")["tenure"].mean())

    X = data.drop(columns=["customerID", "Churn"])
    y = data["Churn"]

    has_total_charges = "TotalCharges" in X.columns
    if has_total_charges:
        # Blank or otherwise non-numeric entries become NaN here; they are
        # filled after the split so the fill value comes from training rows only.
        X["TotalCharges"] = pd.to_numeric(X["TotalCharges"], errors="coerce")

    # Treat every non-numeric column as categorical. Selecting by a fixed list
    # of dtype names would let ColumnTransformer (remainder="drop") silently
    # discard columns stored under any other dtype.
    numeric_cols = X.select_dtypes(include="number").columns.tolist()
    numeric_set = set(numeric_cols)
    cat_cols = [col for col in X.columns if col not in numeric_set]

    preprocess = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
            ("num", "passthrough", numeric_cols),
        ]
    )

    pipe = Pipeline(
        steps=[
            ("prep", preprocess),
            ("model", LinearRegression()),
        ]
    )

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
        stratify=y,
    )

    if has_total_charges:
        # Use the training median for both parts to avoid leaking test-set
        # statistics into the model. fillna returns new frames, so the split
        # results are not mutated in place.
        train_median = X_train["TotalCharges"].median()
        X_train = X_train.fillna({"TotalCharges": train_median})
        X_test = X_test.fillna({"TotalCharges": train_median})

    pipe.fit(X_train, y_train)
    # Linear regression is unbounded; clip so predictions read as scores in [0, 1].
    pred = np.clip(pipe.predict(X_test), 0, 1)

    rmse = np.sqrt(mean_squared_error(y_test, pred))
    auc = roc_auc_score(y_test, pred)

    print("\nModel results: Linear Regression baseline")
    print("RMSE:", round(rmse, 4))
    print("AUC:", round(auc, 4))


if __name__ == "__main__":
    # Script entry point: read the default CSV, then fit and report the baseline.
    churn_frame = load_data()
    build_model(churn_frame)
