import datetime

from IPython.display import display, Markdown, Latex
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
from pandas.api.types import is_numeric_dtype
from pandas.core.dtypes.common import is_datetime_or_timedelta_dtype
import pandas as pd
import plotly.express as px
from scipy import stats


# Load the dataset from its pickle file.
# NOTE: unpickling can execute arbitrary code, so this file must come from a
# trusted source.
df_retail = pd.read_pickle("data/online_retail_II.pickle")

# Identifier-like columns are stored as categoricals; descriptions as text.
# NOTE(review): casting with `str` also turns missing descriptions into the
# literal text "nan", so they stop counting as missing values -- confirm that
# this is intended.
column_dtypes = {
    "Invoice": "category",
    "StockCode": "category",
    "Description": str,
    "Customer ID": "category",
}
df_retail = df_retail.astype(column_dtypes)
df_retail.head()


# Drop exact duplicate rows first, then every row that has a missing value.
df_no_duplicates_or_nans = df_retail.drop_duplicates().dropna()
# Count rows per (Invoice, StockCode) pair. Both keys are categorical, so
# `observed=True` is required: without it pandas builds one group for every
# possible combination of categories, almost all of them with size 0.
df_no_duplicates_or_nans.groupby(["Invoice", "StockCode"], observed=True).size()

Invoice  StockCode   
489434   10002           0
         10080           0
         10109           0
         10120           0
         10125           0
                        ..
C538164  gift_0001_60    0
         gift_0001_70    0
         gift_0001_80    0
         gift_0001_90    0
         m               0
Length: 133475712, dtype: int64


df_retail.dtypes

Invoice              category
StockCode            category
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
Price                 float64
Customer ID          category
Country                object
dtype: object


# Report how many distinct values each categorical / text column holds.
columnas_discretas = df_retail.select_dtypes(["category", "object"])
for nombre, columna in columnas_discretas.items():
    cantidad_unicos = columna.nunique()
    print(f"Cantidad de valores únicos de {nombre}: {cantidad_unicos}")

# Report the [min, max] span of each integer, datetime and float column.
# NOTE(review): "Rando" looks like a typo for "Rango"; the text is left
# untouched here so the printed output stays identical.
columnas_continuas = df_retail.select_dtypes(["int64", "datetime64[ns]", "float64"])
for nombre, columna in columnas_continuas.items():
    extremos = [columna.min(), columna.max()]
    print(f"Rando de valores para {nombre}: {extremos}")

Cantidad de valores únicos de Invoice: 28816
Cantidad de valores únicos de StockCode: 4632
Cantidad de valores únicos de Description: 4682
Cantidad de valores únicos de Customer ID: 4383
Cantidad de valores únicos de Country: 40
Rando de valores para Quantity: [-9600, 19152]
Rando de valores para InvoiceDate: [Timestamp('2009-12-01 07:45:00'), Timestamp('2010-12-09 20:01:00')]
Rando de valores para Price: [-53594.36, 25111.09]


def _profile_percentage(count, total):
    """Format ``count`` as a share of ``total``, e.g. ``"12.34 %"``."""
    if total == 0:
        # An empty series has no meaningful share; avoid dividing by zero.
        return "0.0 %"
    return str(round(count / total * 100, 2)) + " %"


def _profile_describe(serie):
    """Return ``serie.describe()`` with datetimes summarised numerically."""
    try:
        return serie.describe(datetime_is_numeric=True)
    except TypeError:
        # pandas >= 2.0 removed the keyword and always treats datetimes
        # as numeric, so the plain call gives the same summary there.
        return serie.describe()


def _profile_is_datetime_like(serie):
    """Tell whether ``serie`` holds datetimes or timedeltas (public API only)."""
    return pd.api.types.is_datetime64_any_dtype(
        serie
    ) or pd.api.types.is_timedelta64_dtype(serie)


def _profile_sample(serie, n_samples, random_state):
    """Draw up to ``n_samples`` non-missing values from ``serie``."""
    valid = serie.dropna()
    # `Series.sample` raises when asked for more rows than exist.
    return valid.sample(min(n_samples, len(valid)), random_state=random_state)


def _profile_table(serie):
    """Build the summary table (a ``pd.Series``) shown by ``profile_serie``."""
    total = len(serie)

    profile = pd.Series(dtype="object")
    profile["Type"] = serie.dtype
    profile = pd.concat([profile, _profile_describe(serie)])

    if is_numeric_dtype(serie):
        negatives = (serie < 0).sum()
        zeros = (serie == 0).sum()
        profile["Negative"] = negatives
        profile["Negative (%)"] = _profile_percentage(negatives, total)
        profile["Zeros"] = zeros
        profile["Zeros (%)"] = _profile_percentage(zeros, total)
        profile["Kurt"] = serie.kurt()
        profile["Skew"] = serie.skew()

    profile[" "] = " "  # blank separator row

    missing = serie.isnull().sum()
    # keep=False flags every member of a group of repeated values.
    duplicated = serie.duplicated(keep=False).sum()
    profile["Missing cells"] = missing
    profile["Missing cells (%)"] = _profile_percentage(missing, total)
    profile["Duplicate rows"] = duplicated
    profile["Duplicate rows (%)"] = _profile_percentage(duplicated, total)
    profile["Total size in memory"] = str(serie.memory_usage(index=True)) + " bytes"

    return profile.rename(
        index={
            "count": "Number of observations",
            "mean": "Mean",
            "std": "Std",
            "min": "Min",
            "max": "Max",
            "unique": "Unique",
            "top": "Top",
            "freq": "Freq",
        }
    )


def profile_serie(serie_in, n_samples=1000, random_state=42):
    """Display a univariate profile (summary table plus charts) of a series.

    Numeric series get two histograms of a random sample: one with every
    sampled value and one without the values whose absolute z-score is 3 or
    more. Datetime/timedelta series get a single histogram of a sample. Any
    other series gets a bar chart of its 100 most frequent values.

    Parameters
    ----------
    serie_in : pd.Series
        Series to profile. It is only read, never modified.
    n_samples : int, default 1000
        Maximum number of values drawn for the histograms. Fewer are used
        when the series has fewer non-missing values.
    random_state : int, default 42
        Seed passed to ``Series.sample`` so the charts are reproducible.

    Returns
    -------
    None
        The table and the figures are displayed, not returned.
    """
    serie = serie_in
    profile = _profile_table(serie)

    figures = []
    if is_numeric_dtype(serie):
        sampled_serie = _profile_sample(serie, n_samples, random_state)
        figures.append(
            px.histogram(
                sampled_serie, marginal="box", title=f"{serie.name} - With Outliers"
            )
        )

        # z-score = (value - mean) / std. A constant sample yields NaN scores;
        # `~(score >= 3)` keeps those values instead of discarding all of them.
        z_scores = np.abs(stats.zscore(sampled_serie))
        no_outliers = sampled_serie.loc[~(z_scores >= 3)]
        figures.append(
            px.histogram(
                no_outliers, marginal="box", title=f"{serie.name} - Without Outliers"
            )
        )

    elif _profile_is_datetime_like(serie):
        sampled_serie = _profile_sample(serie, n_samples, random_state)
        figures.append(
            px.histogram(sampled_serie, marginal="box", title=f"{serie.name}")
        )

    else:
        # Read the labels from the index and the counts from the values so the
        # result does not depend on how `reset_index` names its columns.
        top_counts = serie.value_counts().head(100)
        figures.append(
            px.bar(
                x=pd.Series(top_counts.index.astype(str), name=serie.name),
                y=pd.Series(top_counts.to_numpy(), name="Count"),
                title=f"100 Most common categories of {serie.name}",
            )
        )

    display(Markdown(f'## {serie.name} Profile'))
    display(profile)
    for figure in figures:
        figure.show()


profile_serie(df_retail['Invoice'])

Type                           category
Number of observations           525461
Unique                            28816
Top                              537434
Freq                                675
                                       
Missing cells                         0
Missing cells (%)                 0.0 %
Duplicate rows                   517456
Duplicate rows (%)              98.48 %
Total size in memory      2338386 bytes
dtype: object


profile_serie(df_retail.StockCode)

Type                           category
Number of observations           525461
Unique                             4632
Top                              85123A
Freq                               3516
                                       
Missing cells                         0
Missing cells (%)                 0.0 %
Duplicate rows                   525026
Duplicate rows (%)              99.92 %
Total size in memory      1220242 bytes
dtype: object


profile_serie(df_retail.Description)

Type                                                  object
Number of observations                                525461
Unique                                                  4682
Top                       WHITE HANGING HEART T-LIGHT HOLDER
Freq                                                    3549
                                                            
Missing cells                                              0
Missing cells (%)                                      0.0 %
Duplicate rows                                        525198
Duplicate rows (%)                                   99.95 %
Total size in memory                           4203816 bytes
dtype: object


profile_serie(df_retail.Quantity)

Type                              int64
Number of observations         525461.0
Mean                          10.337667
Std                           107.42411
Min                             -9600.0
25%                                 1.0
50%                                 3.0
75%                                10.0
Max                             19152.0
Negative                          12326
Negative (%)                     2.35 %
Zeros                                 0
Zeros (%)                         0.0 %
Kurt                        6277.666908
Skew                          36.044617
                                       
Missing cells                         0
Missing cells (%)                 0.0 %
Duplicate rows                   525122
Duplicate rows (%)              99.94 %
Total size in memory      4203816 bytes
dtype: object


profile_serie(df_retail.InvoiceDate)

Type                                     datetime64[ns]
Number of observations                           525461
Mean                      2010-06-28 11:37:36.845017856
Min                                 2009-12-01 07:45:00
25%                                 2010-03-21 12:20:00
50%                                 2010-07-06 09:51:00
75%                                 2010-10-15 12:45:00
Max                                 2010-12-09 20:01:00
                                                       
Missing cells                                         0
Missing cells (%)                                 0.0 %
Duplicate rows                                   520400
Duplicate rows (%)                              99.04 %
Total size in memory                      4203816 bytes
dtype: object


profile_serie(df_retail.Price)

Type                            float64
Number of observations         525461.0
Mean                           4.688834
Std                          146.126914
Min                           -53594.36
25%                                1.25
50%                                 2.1
75%                                4.21
Max                            25111.09
Negative                              3
Negative (%)                      0.0 %
Zeros                              3687
Zeros (%)                         0.7 %
Kurt                       64868.344873
Skew                        -140.768446
                                       
Missing cells                         0
Missing cells (%)                 0.0 %
Duplicate rows                   524485
Duplicate rows (%)              99.81 %
Total size in memory      4203816 bytes
dtype: object


profile_serie(df_retail["Customer ID"])

Type                           category
Number of observations         417534.0
Unique                           4383.0
Top                             14911.0
Freq                             5710.0
                                       
Missing cells                    107927
Missing cells (%)               20.54 %
Duplicate rows                   525327
Duplicate rows (%)              99.97 %
Total size in memory      1218250 bytes
dtype: object


profile_serie(df_retail.Country)

Type                              object
Number of observations            525461
Unique                                40
Top                       United Kingdom
Freq                              485852
                                        
Missing cells                          0
Missing cells (%)                  0.0 %
Duplicate rows                    525461
Duplicate rows (%)               100.0 %
Total size in memory       4203816 bytes
dtype: object


def profile_df(dataframe_in):
    """Display a multivariate profile of a dataframe.

    Three sections are shown in order: a scatter plot for every pair of
    numeric/datetime/timedelta columns, a heatmap of the correlation matrix
    of the numeric columns, and a missing-value matrix of the whole frame.

    Parameters
    ----------
    dataframe_in : pd.DataFrame
        Frame to profile. It is only read, never modified.

    Returns
    -------
    None
        Everything is displayed, nothing is returned.
    """
    df = dataframe_in

    # Columns that can be placed on a scatter-plot axis.
    plottable = [
        col
        for col in df.columns
        if is_numeric_dtype(df[col])
        or pd.api.types.is_datetime64_any_dtype(df[col])
        or pd.api.types.is_timedelta64_dtype(df[col])
    ]

    display(Markdown('## Bivariant Analysis:'))
    # Each unordered pair is plotted once: pair a column only with later ones.
    for position, x_col in enumerate(plottable):
        for y_col in plottable[position + 1:]:
            plt.scatter(df[x_col], df[y_col])
            plt.xlabel(x_col)
            plt.ylabel(y_col)
            plt.title(f"{x_col} v/s {y_col}")
            plt.show()

    display(Markdown('## Correlation:'))
    # Be explicit about numeric_only: the implicit default is deprecated and
    # newer pandas raises when non-numeric columns reach `corr`.
    fig_corr = px.imshow(df.corr(numeric_only=True))
    fig_corr.show()

    display(Markdown('## Missing Matrix:'))
    _, ax = plt.subplots(figsize=[15, 10])
    msno.matrix(df, ax=ax, sparkline=False)
    plt.show()


df_sorted = df_retail.sort_values("InvoiceDate")
profile_df(df_sorted)

C:\Users\David\AppData\Local\Temp\ipykernel_14496\4178638371.py:20: FutureWarning:

The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.


def limpiar_datos(datos_sucios: pd.DataFrame) -> pd.DataFrame:
    """Return the complete, unique rows with positive quantity and price.

    Rows containing any missing value are dropped first, then exact duplicate
    rows, and finally every row whose ``Quantity`` or ``Price`` is not
    strictly positive. The input frame is left untouched.
    """
    sin_nulos_ni_duplicados = datos_sucios.dropna().drop_duplicates()
    cantidad_positiva = sin_nulos_ni_duplicados["Quantity"] > 0
    precio_positivo = sin_nulos_ni_duplicados["Price"] > 0
    return sin_nulos_ni_duplicados[cantidad_positiva & precio_positivo]


df_clean = limpiar_datos(df_sorted)
df_clean.head(5)


for column in df_clean:
    profile_serie(df_clean[column])

Type                           category
Number of observations           400916
Unique                            19213
Top                              500356
Freq                                251
                                       
Missing cells                         0
Missing cells (%)                 0.0 %
Duplicate rows                   399186
Duplicate rows (%)              99.57 %
Total size in memory      5296496 bytes
dtype: object

Type                           category
Number of observations           400916
Unique                             4017
Top                              85123A
Freq                               3107
                                       
Missing cells                         0
Missing cells (%)                 0.0 %
Duplicate rows                   400754
Duplicate rows (%)              99.96 %
Total size in memory      4178352 bytes
dtype: object

Type                                                  object
Number of observations                                400916
Unique                                                  4444
Top                       WHITE HANGING HEART T-LIGHT HOLDER
Freq                                                    3107
                                                            
Missing cells                                              0
Missing cells (%)                                      0.0 %
Duplicate rows                                        400710
Duplicate rows (%)                                   99.95 %
Total size in memory                           6414656 bytes
dtype: object

Type                              int64
Number of observations         400916.0
Mean                          13.767418
Std                           97.638385
Min                                 1.0
25%                                 2.0
50%                                 5.0
75%                                12.0
Max                             19152.0
Negative                              0
Negative (%)                      0.0 %
Zeros                                 0
Zeros (%)                         0.0 %
Kurt                        9418.363882
Skew                          79.281875
                                       
Missing cells                         0
Missing cells (%)                 0.0 %
Duplicate rows                   400802
Duplicate rows (%)              99.97 %
Total size in memory      6414656 bytes
dtype: object

Type                                     datetime64[ns]
Number of observations                           400916
Mean                      2010-07-01 05:01:16.167027712
Min                                 2009-12-01 07:45:00
25%                                 2010-03-26 13:28:00
50%                                 2010-07-09 10:26:00
75%                                 2010-10-14 13:58:45
Max                                 2010-12-09 20:01:00
                                                       
Missing cells                                         0
Missing cells (%)                                 0.0 %
Duplicate rows                                   399449
Duplicate rows (%)                              99.63 %
Total size in memory                      6414656 bytes
dtype: object

Type                            float64
Number of observations         400916.0
Mean                           3.305826
Std                           35.047719
Min                               0.001
25%                                1.25
50%                                1.95
75%                                3.75
Max                             10953.5
Negative                              0
Negative (%)                      0.0 %
Zeros                                 0
Zeros (%)                         0.0 %
Kurt                       62818.874688
Skew                         233.142978
                                       
Missing cells                         0
Missing cells (%)                 0.0 %
Duplicate rows                   400730
Duplicate rows (%)              99.95 %
Total size in memory      6414656 bytes
dtype: object

Type                           category
Number of observations         400916.0
Unique                           4312.0
Top                             14911.0
Freq                             5568.0
                                       
Missing cells                         0
Missing cells (%)                 0.0 %
Duplicate rows                   400825
Duplicate rows (%)              99.98 %
Total size in memory      4176360 bytes
dtype: object

Type                              object
Number of observations            400916
Unique                                37
Top                       United Kingdom
Freq                              364233
                                        
Missing cells                          0
Missing cells (%)                  0.0 %
Duplicate rows                    400916
Duplicate rows (%)               100.0 %
Total size in memory       6414656 bytes
dtype: object


profile_df(df_clean)

C:\Users\David\AppData\Local\Temp\ipykernel_14496\4178638371.py:20: FutureWarning:

The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.


# Revenue of each line item: unit price times units sold.
df_clean["Income"] = df_clean["Price"] * df_clean["Quantity"]

# Total units and revenue per product description, as a flat table.
product_gains = (
    df_clean.groupby("Description")[["Quantity", "Income"]]
    .sum()
    .rename(columns={"Quantity": "SoldQuantity"})
    .reset_index()
)
# Descriptions are lower-cased only after grouping, so the groups themselves
# are still formed on the original spelling.
product_gains["Description"] = product_gains["Description"].str.lower()
product_gains


# 30 products with the most units sold. The order is reversed, presumably so
# that the largest bar ends up at the top of the horizontal chart.
top = product_gains.nlargest(30, "SoldQuantity").iloc[::-1]
px.bar(top, y="Description", x="SoldQuantity")


# Same chart for the 30 products with the highest income.
top = product_gains.nlargest(30, "Income").iloc[::-1]
px.bar(top, y="Description", x="Income")


def plot_ventas(dataframe):
    """Show bar charts of total quantity sold per day, week, month and year.

    Each invoice date is assigned to the calendar period that contains it and
    the bar is drawn at the start of that period. Dates are truncated rather
    than rounded to the nearest multiple of a fixed number of days, so a sale
    made in the afternoon stays in its own day and the monthly and yearly
    bins follow the calendar.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Must contain a datetime column ``InvoiceDate`` and a numeric column
        ``Quantity``. It is only read, never modified.

    Returns
    -------
    None
        The four figures are displayed, not returned.
    """
    invoice_dates = dataframe["InvoiceDate"]
    quantities = dataframe["Quantity"]
    periods = (
        ("D", "Daily"),
        ("W", "Weekly"),
        ("M", "Monthly"),
        ("Y", "Yearly"),
    )
    for period_alias, period_name in periods:
        # Start timestamp of the calendar period each invoice falls in.
        period_start = invoice_dates.dt.to_period(period_alias).dt.start_time
        totals = quantities.groupby(period_start).sum().reset_index()
        fig = px.bar(totals, x="InvoiceDate", y="Quantity")
        fig.update_layout(title_text=f"{period_name} sales")
        fig.show()


plot_ventas(df_clean)

	Description	SoldQuantity	Income
0	doormat union jack guns and roses	167	1071.25
1	3 stripey mice feltcraft	662	1241.10
2	4 purple flock dinner candles	200	265.20
3	animal stickers	385	80.85
4	black pirate treasure chest	45	74.25
...	...	...	...
4439	zinc heart lattice tray oval	325	364.15
4440	zinc metal heart decoration	13771	16472.75
4441	zinc police box lantern	193	783.70
4442	zinc top 2 door wooden shelf	233	1325.35
4443	zinc willie winkie candle stick	3626	3007.22

Laboratorio 5: El Pandas no Muerde (act IV) 🐼

Cuerpo Docente:¶

Equipo: SUPER IMPORTANTE - notebooks sin nombre no serán revisados¶

Link de repositorio de GitHub: `https://github.com/johnny-godoy/laboratorios-mds/blob/main/lab%205/laboratorio_5.ipynb`¶

Reglas:¶

Objetivos principales del laboratorio¶

Descripción del laboratorio.¶

Importamos librerías útiles 😸¶

Segmentación de Clientes en Tienda de Retail 🛍️¶

1.1 Cargar Dataset¶

1.2 Análisis Exploratorio de los Datos [0.5 puntos]¶

1.2.1 Análisis Univariado [2 Puntos]¶

Invoice Profile¶

StockCode Profile¶

Description Profile¶

Quantity Profile¶

InvoiceDate Profile¶

Price Profile¶

Customer ID Profile¶

Country Profile¶

1.2.2 Análisis Multivariado y Datos Faltantes [1 ptos]¶

Bivariant Analysis:¶

Correlation:¶

Missing Matrix:¶

1.2.3 Limpieza de Datos [1 pto]¶

Invoice Profile¶

StockCode Profile¶

Description Profile¶

Quantity Profile¶

InvoiceDate Profile¶

Price Profile¶

Customer ID Profile¶

Country Profile¶

Bivariant Analysis:¶

Correlation:¶

Missing Matrix:¶

1.2.4 Obtención de TOPs [0.75 ptos]¶

1.2.5 Visualización del registro temporal [0,75 ptos]¶

Conclusión¶

	Invoice	StockCode	Description	Quantity	InvoiceDate	Price	Customer ID	Country
0	489434	85048	15CM CHRISTMAS GLASS BALL 20 LIGHTS	12	2009-12-01 07:45:00	6.95	13085.0	United Kingdom
1	489434	79323P	PINK CHERRY LIGHTS	12	2009-12-01 07:45:00	6.75	13085.0	United Kingdom
2	489434	79323W	WHITE CHERRY LIGHTS	12	2009-12-01 07:45:00	6.75	13085.0	United Kingdom
3	489434	22041	RECORD FRAME 7" SINGLE SIZE	48	2009-12-01 07:45:00	2.10	13085.0	United Kingdom
4	489434	21232	STRAWBERRY CERAMIC TRINKET BOX	24	2009-12-01 07:45:00	1.25	13085.0	United Kingdom

Atributo	Tipo de Datos	Rango	Explicación
Invoice	category	28816 códigos	Identificador de la boleta
StockCode	category	4632 códigos	Código del producto
Description	object	4682 descripciones	Descripción del producto
Quantity	int64	[-9600, 19152]	Cantidad transaccionada
InvoiceDate	datetime64[ns]	01/12/2009 - 09/12/2010	Fecha de la transacción
Price	float64	[-53594.36, 25111.09]	Precio del producto
Customer ID	category	4383 códigos	Identificador del cliente
Country	object	40 países	País de la transacción

Laboratorio 5: El Pandas no Muerde (act IV) 🐼

Cuerpo Docente:¶

Equipo: SUPER IMPORTANTE - notebooks sin nombre no serán revisados¶

Link de repositorio de GitHub: https://github.com/johnny-godoy/laboratorios-mds/blob/main/lab%205/laboratorio_5.ipynb¶

Reglas:¶

Objetivos principales del laboratorio¶

Descripción del laboratorio.¶

Importamos librerías útiles 😸¶

Segmentación de Clientes en Tienda de Retail 🛍️¶

1.1 Cargar Dataset¶

1.2 Análisis Exploratorio de los Datos [0.5 puntos]¶

1.2.1 Análisis Univariado [2 Puntos]¶

Invoice Profile¶

StockCode Profile¶

Description Profile¶

Quantity Profile¶

InvoiceDate Profile¶

Price Profile¶

Customer ID Profile¶

Country Profile¶

1.2.2 Análisis Multivariado y Datos Faltantes [1 ptos]¶

Bivariant Analysis:¶

Correlation:¶

Missing Matrix:¶

1.2.3 Limpieza de Datos [1 pto]¶

Invoice Profile¶

StockCode Profile¶

Description Profile¶

Quantity Profile¶

InvoiceDate Profile¶

Price Profile¶

Customer ID Profile¶

Country Profile¶

Bivariant Analysis:¶

Correlation:¶

Missing Matrix:¶

1.2.4 Obtención de TOPs [0.75 ptos]¶

1.2.5 Visualización del registro temporal [0,75 ptos]¶

Conclusión¶

Link de repositorio de GitHub: `https://github.com/johnny-godoy/laboratorios-mds/blob/main/lab%205/laboratorio_5.ipynb`¶