import datetime

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.exceptions import NotFittedError
from sklearn.impute import SimpleImputer
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

from IPython.display import HTML


# Load the cleaned retail dataset and set the column dtypes.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
df_retail = pd.read_pickle("data/online_retail_II_cleaned.pickle")
df_retail = df_retail.astype(
    {
        "Invoice": "category",
        "StockCode": "category",
        # The original dict listed "Description" twice ("category", then str);
        # in a dict literal the last entry wins, so only `str` ever applied.
        "Description": str,
        "Customer ID": "category",
        "Country": "category",
    }
)
df_retail.head()


def custom_features(dataframe_in: pd.DataFrame, m: int = None) -> pd.DataFrame:
    """Return a frame with the LRMFP model features, one row per customer.

    The input must provide the columns ``Customer ID``, ``InvoiceDate``,
    ``Price`` and ``Quantity``. The output is indexed by the sorted unique
    customer ids and has the columns:

    * ``Length``: days between the customer's first and last invoice date.
    * ``Recency``: days between the reference date (latest invoice date in
      ``dataframe_in`` plus one day) and the mean date of the customer's
      last ``m`` rows.
    * ``Frecuency``: number of rows recorded for the customer.
    * ``Monetary``: mean of ``Price * Quantity`` over the customer's rows.
    * ``Periodicity``: standard deviation of the day gaps between the
      customer's consecutive rows (ordered by date).

    Parameters
    ----------
    dataframe_in : pd.DataFrame
        Frame with the retail data.
    m : int, optional
        Number of most recent rows per customer used to compute recency.
        If None, the maximum between 1 and the minimum frequency is used.

    Returns
    -------
    df_out : pd.DataFrame
        Frame with the LRMFP features.
    """
    purchases = dataframe_in[["Customer ID", "InvoiceDate"]].copy()
    purchases["prod"] = dataframe_in.Price * dataframe_in.Quantity

    # Wrapping in pd.Index makes the sort work for categorical and plain
    # (numeric / object) customer-id columns alike.
    customer_ids = pd.Index(dataframe_in["Customer ID"].unique()).sort_values()
    df_out = pd.DataFrame(index=customer_ids)

    # Sort once by date so that tail() and diff() see each customer's rows
    # in chronological order.
    by_customer = purchases.sort_values(by="InvoiceDate").groupby("Customer ID")
    dates = by_customer.InvoiceDate

    df_out["Length"] = (dates.max() - dates.min()).dt.days

    freq = dates.count()
    if m is None:
        m = max(1, int(freq.min()))
    # The reference date is derived from the frame passed in, not from a
    # module-level variable, so the function works on any input frame.
    last_date = dataframe_in.InvoiceDate.max() + datetime.timedelta(days=1)
    recent_mean_date = by_customer.tail(m).groupby("Customer ID").InvoiceDate.mean()
    df_out["Recency"] = (last_date - recent_mean_date).dt.days

    df_out["Frecuency"] = freq
    df_out["Monetary"] = by_customer["prod"].mean()

    # diff() keeps the original row index, so the assignment aligns row by row.
    purchases["diff_time"] = dates.diff().dt.days
    df_out["Periodicity"] = purchases.groupby("Customer ID").diff_time.std()

    return df_out


# Compute the features for the full dataset; the result is not stored
# (presumably shown as the notebook cell output).
custom_features(df_retail)


class MinMax(BaseEstimator, TransformerMixin):
    """Scale each feature so that it lies in the range [0, 1] on the training set.

    A feature that is constant on the training data is mapped to zero.
    """

    def fit(self, X, y=None):
        """Compute the per-feature minimum and range used for scaling.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data used to compute the per-feature minimum and maximum
            (NaN values are ignored).
        y : None
            Ignored, present for API compatibility.

        Returns
        -------
        self : MinMax
            The fitted scaler.
        """
        self.min_ = np.nanmin(X, axis=0)
        self.denominator_ = np.nanmax(X, axis=0) - self.min_
        # A constant feature has zero range; divide by 1 instead so it maps to 0.
        self.denominator_[self.denominator_ == 0.] = 1.
        return self

    def transform(self, X):
        """Scale the features of X using the fitted minimum and range.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data to transform.

        Returns
        -------
        Xt : array-like of shape (n_samples, n_features)
            Transformed data.

        Raises
        ------
        NotFittedError
            If `fit` has not been called on this instance.
        """
        # Only the attribute lookup is guarded: an AttributeError raised while
        # doing arithmetic on X must not be misreported as "not fitted".
        try:
            offset, scale = self.min_, self.denominator_
        except AttributeError as err:
            raise NotFittedError("Esta instancia de MinMax no está entrenada todavía. "
                                 "Llame a `fit` con los argumentos apropiados antes de usar este transformador"
                                 ) from err
        return (X - offset)/scale


# Preprocessing: feature engineering, mean imputation, then [0, 1] scaling.
# Every column is scaled, so no ColumnTransformer is needed.
preprocessing = Pipeline(
    [
        ("feature_transformer", FunctionTransformer(custom_features)),
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", MinMax()),
    ]
)

# Project the preprocessed features onto two components with t-SNE.
dimensionality_reduction = Pipeline(
    [
        ("preprocessing", preprocessing),
        (
            "reducer",
            TSNE(init="random", learning_rate="auto", n_components=2, random_state=0),
        ),
    ]
)
X_reduced = pd.DataFrame(
    data=dimensionality_reduction.fit_transform(df_retail),
    columns=["x", "y"],
)
X_reduced


# Scatter plot of the two t-SNE components.
fig = px.scatter(X_reduced, x="x", y="y")
fig.update_layout(title="Espacio de dimensión reducida con T-SNE")


def process_k_means(n_clusters: int) -> Pipeline:
    """Build and fit a pipeline with the shared preprocessing plus a KMeans step.

    The pipeline is fitted on the module-level ``df_retail`` frame.

    Parameters
    ----------
    n_clusters : int
        Number of clusters KMeans looks for.

    Returns
    -------
    clst : Pipeline
        The fitted clustering pipeline.
    """
    # Imported locally because `clone` is not among the file's top-level imports.
    from sklearn.base import clone

    # Clone the preprocessing steps so each fitted pipeline owns its own
    # preprocessing state instead of sharing (and refitting in place) the
    # single module-level `preprocessing` instance.
    clst = Pipeline(
        steps=[
            ("preprocessing", clone(preprocessing)),
            ("cluster", KMeans(n_clusters=n_clusters, random_state=0)),
        ],
    )
    return clst.fit(df_retail)


# Fit one clustering pipeline per candidate k (1 through 19) and collect
# the inertia of each fitted KMeans step.
kvals = np.arange(1, 20)
clustering_models = list(map(process_k_means, kvals))
inertias = [model.named_steps["cluster"].inertia_ for model in clustering_models]


# Elbow plot: inertia as a function of the number of clusters.
fig = px.line(x=kvals, y=inertias)
fig.update_layout(
    xaxis_title="Cantidad de clusters",
    yaxis_title="Inercia",
    title="Método del codo para determinar cantidad de clusters",
)
fig.show()


# Number of clusters to keep (presumably read off the elbow plot).
optimal_k = 3


# Find the model by its k value rather than by `optimal_k - 1`, which is only
# correct while `kvals` starts at 1 with step 1. Raises ValueError if
# `optimal_k` is not among the candidates that were fitted.
optimal_model = clustering_models[kvals.tolist().index(optimal_k)]
df_clusters = custom_features(df_retail)
# The labels are assigned positionally to the rows of the feature frame.
labels = optimal_model.named_steps["cluster"].labels_
df_clusters["Cluster"] = labels
by_cluster = df_clusters.groupby("Cluster")


# Per-cluster mean and median of every feature, plus the row count of each cluster.
agrupaciones = by_cluster.aggregate(["mean", "median"])
agrupaciones["cluster size"] = by_cluster.size()
agrupaciones


# Attach the cluster labels to the t-SNE coordinates as a categorical column
# and colour the scatter plot by cluster.
X_reduced_with_cluster = X_reduced.assign(Cluster=labels).astype({"Cluster": "category"})
fig = px.scatter(X_reduced_with_cluster, x="x", y="y", color="Cluster")
fig.update_layout(title="Espacio de dimensión reducida con T-SNE")

	Invoice	StockCode	Description	Quantity	InvoiceDate	Price	Customer ID	Country
0	489434	85048	15CM CHRISTMAS GLASS BALL 20 LIGHTS	12	2009-12-01 07:45:00	6.95	13085.0	United Kingdom
1	489434	79323P	PINK CHERRY LIGHTS	12	2009-12-01 07:45:00	6.75	13085.0	United Kingdom
2	489434	79323W	WHITE CHERRY LIGHTS	12	2009-12-01 07:45:00	6.75	13085.0	United Kingdom
3	489434	22041	RECORD FRAME 7" SINGLE SIZE	48	2009-12-01 07:45:00	2.10	13085.0	United Kingdom
4	489434	21232	STRAWBERRY CERAMIC TRINKET BOX	24	2009-12-01 07:45:00	1.25	13085.0	United Kingdom

Customer ID	Length	Recency	Frequency	Monetary	Periodicity
12346.0	294	67	46	-64.68	37.0
12347.0	37	3	71	1323.32	0.0
12349.0	327	43	107	2646.99	78.0
12352.0	16	11	18	343.80	0.0
12356.0	44	16	84	3562.25	12.0

	Length	Recency	Frecuency	Monetary	Periodicity
12346.0	196	165	33	11.298788	21.724076
12347.0	37	3	71	18.638310	4.422346
12348.0	0	74	20	11.108000	0.000000
12349.0	181	43	102	26.187647	16.200990
12351.0	0	11	21	14.330000	0.000000
...	...	...	...	...	...
18283.0	275	18	217	2.854240	11.783701
18284.0	0	67	28	16.488571	0.000000
18285.0	0	296	12	35.583333	0.000000
18286.0	247	112	67	19.349701	30.403598
18287.0	188	18	85	27.596588	15.299909

	x	y
0	8.378868	-42.816170
1	-1.798583	21.489256
2	-71.841568	8.571382
3	21.666998	-18.503113
4	-9.306869	39.164833
...	...	...
4309	53.665306	-13.795135
4310	-68.078026	20.554010
4311	-33.021099	52.639996
4312	24.813828	-38.750008
4313	21.537745	-6.155716

	Length		Recency		Frecuency		Monetary		Periodicity		cluster size
	mean	median	mean	median	mean	median	mean	median	mean	median
Cluster
0	23.645464	0.0	251.627737	245.0	28.454640	19.0	53.258432	16.870000	3.940067	0.000000	959
1	277.280495	278.0	37.318156	25.0	167.035413	97.0	32.357707	18.077614	21.125143	15.854776	1779
2	39.241117	0.0	54.590736	46.0	48.542513	29.0	32.311214	16.788990	5.395593	0.397565	1576

Laboratorio 6: La desesperación de Mr. Lepin 🐼

Cuerpo Docente:¶

Equipo: SUPER IMPORTANTE - notebooks sin nombre no serán revisados¶

Link de repositorio de GitHub: `https://github.com/johnny-godoy/laboratorios-mds/blob/main/lab%206/laboratorio_6.ipynb`¶

Índice¶

Temas a tratar¶

Reglas:¶

Objetivos principales del laboratorio¶

Descripción del laboratorio.¶

Importamos librerías útiles 😸¶

Segmentación de Clientes en Tienda de Retail 🛍️¶

1.1 Cargar Dataset¶

1.2 Creación de nuevas Características [2 Puntos]¶

1.3 Pipelines 👷¶

1.3.1 Estandarizar Características [0.5 puntos]¶

1.3.2 `T-SNE` Pipeline [1.0 puntos]¶

1.3.3 Clustering¶

1.3.3.1 Método del Codo [1 punto]¶

1.3.3.2 Segmentación de Clientes con K-Means 🎁 [1 punto]¶

1.3.3.3 Plot de K-Means 📈 [0.5 puntos]¶

Conclusión¶

	Length	Recency	Frequency	Monetary	Periodicity
Cluster
0	258.8	45.2	76.1	1107.7	107.6	449
1	76.1	217.6	45.5	791.7	14.1	466
2	368.5	4.8	2715.0	226621.6	4.2	4
3	85.3	45.7	65.8	1047.0	10.5	987
4	347.2	15.9	1658.0	35829.3	8.0	25
5	298.0	29.8	183.8	3639.9	32.0	1188

Laboratorio 6: La desesperación de Mr. Lepin 🐼

Cuerpo Docente:¶

Equipo: SUPER IMPORTANTE - notebooks sin nombre no serán revisados¶

Link de repositorio de GitHub: https://github.com/johnny-godoy/laboratorios-mds/blob/main/lab%206/laboratorio_6.ipynb¶

Índice¶

Temas a tratar¶

Reglas:¶

Objetivos principales del laboratorio¶

Descripción del laboratorio.¶

Importamos librerías útiles 😸¶

Segmentación de Clientes en Tienda de Retail 🛍️¶

1.1 Cargar Dataset¶

1.2 Creación de nuevas Características [2 Puntos]¶

1.3 Pipelines 👷¶

1.3.1 Estandarizar Características [0.5 puntos]¶

1.3.2 T-SNE Pipeline [1.0 puntos]¶

1.3.3 Clustering¶

1.3.3.1 Método del Codo [1 punto]¶

1.3.3.2 Segmentación de Clientes con K-Means 🎁 [1 punto]¶

1.3.3.3 Plot de K-Means 📈 [0.5 puntos]¶

Conclusión¶

Link de repositorio de GitHub: `https://github.com/johnny-godoy/laboratorios-mds/blob/main/lab%206/laboratorio_6.ipynb`¶

1.3.2 `T-SNE` Pipeline [1.0 puntos]¶