from __future__ import annotations

# Librería core del lab.
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Pre-procesamiento
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

# Métricas de evaluación
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score

# Clasificadores
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Regresores
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# Librería para graficar
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Route every pandas ``.plot()`` / ``.hist()`` call through plotly.
pd.set_option("plotting.backend", "plotly")


# Load the player statistics; the path is relative to the working directory.
df_players = pd.read_csv('data/stats_players.csv')
df_players  # bare expression: displays the frame when run as a notebook cell


# Report how many players have no National_Position value.
print(f"Cantidad de valores nulos: {df_players['National_Position'].isna().sum()}")

Cantidad de valores nulos: 16513


print("Valores por cada clase")
# Frequency of every National_Position value, reversed so the most
# frequent class is listed last.
counts = df_players["National_Position"].value_counts().iloc[::-1]
# One bar per class: bar length is the count, the y axis carries the label.
fig = px.bar(x=counts.to_numpy(), y=counts.index)
fig.update_layout(
    title="Cantidad de valores por cada clase",
    xaxis_title="Cantidad de valores",
    yaxis_title="Clase",
)
fig.show()

Valores por cada clase


# Binary target: True when the player has a National_Position value.
y = df_players["National_Position"].notna()
y.hist()


# Feature frame: every column except the one the target was derived from.
X = df_players.drop(columns="National_Position")


# Columns that must not be fed to the generic categorical (one-hot) encoder.
# The names have to match the dataframe's column labels exactly: the position
# column is spelled "Club_Position" (underscore), not "Club Position".
exclude = {"Name", "Nationality", "Club_Position",  # high cardinality
           "National_Position",  # target variable
          }


def get_col_index(columns: list[str] | set[str] | pd.Index,
                  frame: pd.DataFrame | None = None) -> list[int]:
    """Return the positional indices of the given columns within a frame.

    Args:
        columns: Column labels to look up. Labels that are not present in
            ``frame`` are silently ignored.
        frame: Dataframe whose column order defines the positions. When
            omitted, the module-level ``X`` is used; it is looked up at call
            time, so the function can be defined before ``X`` exists.

    Returns:
        The matching positions as plain Python ints, in ascending order.
        Pass the frame that will actually be transformed: positions computed
        on one frame are not valid for a frame with a different column layout.
    """
    if frame is None:
        frame = X
    return np.where(frame.columns.isin(list(columns)))[0].tolist()


# Positional column indices of X (the default frame of get_col_index),
# grouped by the preprocessing branch each group is routed to.
int_index = get_col_index(X.select_dtypes("int").columns)
float_index = get_col_index(X.select_dtypes("float").columns)
object_index = get_col_index(set(X.select_dtypes("object").columns).difference(exclude))
club_position_index = get_col_index(["Club_Position"])


# One branch per column group:
#   * integers: median imputation, then min-max scaling;
#   * floats: standardisation first, mean imputation afterwards;
#   * remaining object columns: one-hot encoding;
#   * Club_Position: most-frequent imputation, ordinal codes, min-max scaling.
preprocessor = ColumnTransformer(
    transformers=[
        ("int_processor",
         make_pipeline(SimpleImputer(strategy="median"), MinMaxScaler()),
         int_index),
        ("float_processor",
         make_pipeline(StandardScaler(), SimpleImputer(strategy="mean")),
         float_index),
        ("small_cat_processor",
         OneHotEncoder(handle_unknown='ignore', drop='first', sparse=False),
         object_index),
        ("large_cat_processor",
         make_pipeline(SimpleImputer(strategy="most_frequent"), OrdinalEncoder(), MinMaxScaler()),
         club_position_index),
    ],
)


# Three classifiers, each chained after the same preprocessor object.
pipelines = tuple(
    make_pipeline(preprocessor, classifier)
    for classifier in (LinearSVC(), KNeighborsClassifier(), RandomForestClassifier())
)
# Keep the individual names available alongside the tuple.
linear_svc, knn, random_forest = pipelines


# Stratified split so both partitions keep the same True/False proportion.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0,
                                                    shuffle=True, stratify=y)


# Baseline that ignores the features. Classes it never predicts have an
# undefined precision; zero_division=0 reports them as 0.0 (the same value
# the default produces) without emitting UndefinedMetricWarning.
dummy_pred = DummyClassifier().fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, dummy_pred, zero_division=0))

              precision    recall  f1-score   support

       False       0.94      1.00      0.97      4128
        True       0.00      0.00      0.00       269

    accuracy                           0.94      4397
   macro avg       0.47      0.50      0.48      4397
weighted avg       0.88      0.94      0.91      4397

C:\Users\David\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_classification.py:1327: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

C:\Users\David\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_classification.py:1327: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

C:\Users\David\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_classification.py:1327: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


# Fit every pipeline on the training split and report its test metrics.
for model in pipelines:
    y_pred = model.fit(X_train, y_train).predict(X_test)
    # The last pipeline step is the classifier. The previous filter compared
    # each step object against the string "pipeline", which is always unequal,
    # so it never removed anything and only the trailing [-1] mattered.
    clf = model.steps[-1][1]
    print(f"Resultados para {clf.__class__.__name__}")
    print(classification_report(y_test, y_pred))

Resultados para LinearSVC
              precision    recall  f1-score   support

       False       0.94      1.00      0.97      4128
        True       0.00      0.00      0.00       269

    accuracy                           0.94      4397
   macro avg       0.47      0.50      0.48      4397
weighted avg       0.88      0.94      0.91      4397

Resultados para KNeighborsClassifier
              precision    recall  f1-score   support

       False       0.94      1.00      0.97      4128
        True       0.46      0.06      0.11       269

    accuracy                           0.94      4397
   macro avg       0.70      0.53      0.54      4397
weighted avg       0.91      0.94      0.92      4397

Resultados para RandomForestClassifier
              precision    recall  f1-score   support

       False       0.94      1.00      0.97      4128
        True       0.57      0.09      0.15       269

    accuracy                           0.94      4397
   macro avg       0.76      0.54      0.56      4397
weighted avg       0.92      0.94      0.92      4397

# Club position codes grouped into six coarse roles.
ataque = ['ST', 'CF']
central_ataque = ['RW', 'CAM', 'LW']
central = ['RM', 'CM', 'LM']
central_defensa = ['RWB', 'CDM', 'LWB']
defensa = ['RB', 'CB', 'LB']
arquero = ['GK']


# Map every position code to the label of its group. The mapping is derived
# from the lists above so the two definitions cannot drift apart.
replace = {
    code: label
    for label, codes in (
        ("ataque", ataque),
        ("central_ataque", central_ataque),
        ("central", central),
        ("central_defensa", central_defensa),
        ("defensa", defensa),
        ("arquero", arquero),
    )
    for code in codes
}
posicion = df_players.Club_Position.replace(replace)
# Keep only the players whose position was mapped to one of the groups.
posicion = posicion[posicion.isin(set(replace.values()))]
# posicion.index holds index *labels*, so select by label (.loc) rather than
# by position (.iloc); the two only coincide for a default RangeIndex.
X_new = X.drop("Club_Position", axis=1).loc[posicion.index]


# Number of players per grouped position, reversed so the most frequent
# group is listed last.
y_plot = posicion.value_counts().iloc[::-1]
fig = px.histogram(x=y_plot.to_numpy(), y=y_plot.index)
fig.update_layout(
    title="Cantidad de valores por cada clase",
    xaxis_title="Cantidad de valores",
    yaxis_title="Clase",
)
fig.show()


# Stratified split on the grouped position label.
X_train, X_test, y_train, y_test = train_test_split(X_new, posicion, random_state=0,
                                                    shuffle=True, stratify=posicion)


# Column positions must be computed on X_new itself. get_col_index falls back
# to X when no frame is given, and X still contains Club_Position, so every
# position after that column would be shifted by one in X_new.
int_index = get_col_index(X_new.select_dtypes("int").columns, X_new)
float_index = get_col_index(X_new.select_dtypes("float").columns, X_new)
object_index = get_col_index(set(X_new.select_dtypes("object").columns) - exclude, X_new)


# Feature-agnostic baseline. zero_division=0 reports the never-predicted
# classes as 0.0 without emitting UndefinedMetricWarning.
dummy_pred = DummyClassifier().fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, dummy_pred, zero_division=0))

                 precision    recall  f1-score   support

        arquero       0.00      0.00      0.00       158
         ataque       0.00      0.00      0.00       108
        central       0.00      0.00      0.00       227
 central_ataque       0.00      0.00      0.00       145
central_defensa       0.00      0.00      0.00        52
        defensa       0.30      1.00      0.46       295

       accuracy                           0.30       985
      macro avg       0.05      0.17      0.08       985
   weighted avg       0.09      0.30      0.14       985

C:\Users\David\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_classification.py:1327: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

C:\Users\David\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_classification.py:1327: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

C:\Users\David\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_classification.py:1327: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


# Preprocessing for the position task: scaled integers, standardised floats
# and one-hot encoded object columns, using the current index lists.
svc_processor = ColumnTransformer(
    transformers=[
        ("int_processor",
         make_pipeline(SimpleImputer(strategy="median"), MinMaxScaler()),
         int_index),
        ("float_processor",
         make_pipeline(StandardScaler(), SimpleImputer(strategy="mean")),
         float_index),
        ("small_cat_processor",
         OneHotEncoder(handle_unknown='ignore', drop='first', sparse=False),
         object_index),
    ],
)
linear_svc = make_pipeline(svc_processor, LinearSVC())
linear_svc.fit(X_train, y_train)
y_pred = linear_svc.predict(X_test)
print(classification_report(y_test, y_pred))

                 precision    recall  f1-score   support

        arquero       1.00      1.00      1.00       158
         ataque       0.76      0.86      0.81       108
        central       0.61      0.66      0.63       227
 central_ataque       0.53      0.30      0.39       145
central_defensa       0.62      0.19      0.29        52
        defensa       0.80      0.98      0.88       295

       accuracy                           0.75       985
      macro avg       0.72      0.66      0.67       985
   weighted avg       0.73      0.75      0.73       985

C:\Users\David\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\preprocessing\_encoders.py:188: UserWarning:

Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros


# k-nearest neighbours on top of the same preprocessing as the linear SVC.
knn = make_pipeline(svc_processor, KNeighborsClassifier())
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(classification_report(y_test, y_pred))

C:\Users\David\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\preprocessing\_encoders.py:188: UserWarning:

Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros

                 precision    recall  f1-score   support

        arquero       1.00      1.00      1.00       158
         ataque       0.65      0.73      0.69       108
        central       0.50      0.56      0.52       227
 central_ataque       0.41      0.24      0.30       145
central_defensa       0.13      0.04      0.06        52
        defensa       0.74      0.88      0.81       295

       accuracy                           0.67       985
      macro avg       0.57      0.57      0.56       985
   weighted avg       0.64      0.67      0.65       985


# Preprocessing for the random forest: no scaling, imputation with
# missing-value indicator columns, and ordinal codes for the object columns
# (-1 for categories unseen during fit, -2 for missing values).
tree_processor = ColumnTransformer(
    transformers=[
        ("int_processor",
         SimpleImputer(strategy="median", add_indicator=True),
         int_index),
        ("float_processor",
         SimpleImputer(strategy="mean", add_indicator=True),
         float_index),
        ("small_cat_processor",
         OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1,
                        encoded_missing_value=-2),
         object_index),
    ],
)
tree = make_pipeline(tree_processor, RandomForestClassifier())
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)
print(classification_report(y_test, y_pred))

                 precision    recall  f1-score   support

        arquero       1.00      1.00      1.00       158
         ataque       0.79      0.83      0.81       108
        central       0.59      0.71      0.64       227
 central_ataque       0.56      0.28      0.37       145
central_defensa       0.75      0.17      0.28        52
        defensa       0.80      0.97      0.88       295

       accuracy                           0.75       985
      macro avg       0.75      0.66      0.66       985
   weighted avg       0.74      0.75      0.73       985


# Copy of the features with the grouped position attached (aligned on index).
X_temp = X.assign(Posicion=posicion)
# Salary table; the "Unnamed: 0" column is discarded right after loading.
sueldos = pd.read_csv('data/sueldos.csv').drop(columns="Unnamed: 0")
# Default (inner) merge: player name on the left, "Player" on the right.
df_sueldos = X_temp.merge(sueldos, left_on='Name', right_on='Player')
df_sueldos  # bare expression: displays the frame when run as a notebook cell


# Regression data: the weekly salary is the target; the identifier-like
# columns are removed from the features.
X_reg = df_sueldos.drop(["Weekly Salary", "Name", "Player", "Nationality"], axis=1)
y_reg = df_sueldos["Weekly Salary"]
X_train, X_test, y_train, y_test = train_test_split(X_reg, y_reg, random_state=0, shuffle=True)


# Column positions must be computed on X_reg itself. get_col_index falls back
# to X when no frame is given, but X has a different column layout (it still
# holds Name and Nationality and has no Posicion column), so positions taken
# from it would point at the wrong columns of X_reg.
int_index = get_col_index(X_reg.select_dtypes("int").columns, X_reg)
float_index = get_col_index(X_reg.select_dtypes("float").columns, X_reg)
object_index = get_col_index(set(X_reg.select_dtypes("object").columns) - exclude, X_reg)
# Scaled integers, standardised floats and one-hot encoded object columns.
reg_processor = ColumnTransformer([("int_processor", make_pipeline(SimpleImputer(strategy="median"), MinMaxScaler()), int_index),
                                   ("float_processor", make_pipeline(StandardScaler(), SimpleImputer(strategy="mean")), float_index),
                                   ("small_cat_processor", OneHotEncoder(handle_unknown='ignore', drop='first', sparse=False), object_index),
                                 ])


# Linear regressors: fit each on the training split and print its test R^2.
models = (Ridge(), SVR(kernel="linear"))
for model in models:
    pipeline = make_pipeline(reg_processor, model)
    pipeline.fit(X_train, y_train)
    score = pipeline.score(X_test, y_test)
    print(f"R^2 para {type(model).__name__}: {score}")

R^2 para Ridge: 0.2507424762140851

C:\Users\David\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\preprocessing\_encoders.py:188: UserWarning:

Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros

C:\Users\David\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\preprocessing\_encoders.py:188: UserWarning:

Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros

R^2 para SVR: -0.1115151394538163


# Non-linear regressors: fit each on the training split and print its test R^2.
models = (RandomForestRegressor(), SVR(kernel="rbf"))
for model in models:
    pipeline = make_pipeline(reg_processor, model)
    pipeline.fit(X_train, y_train)
    score = pipeline.score(X_test, y_test)
    print(f"R^2 para {type(model).__name__}: {score}")

C:\Users\David\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\preprocessing\_encoders.py:188: UserWarning:

Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros

R^2 para RandomForestRegressor: 0.5617814005119344
R^2 para SVR: -0.11187892479809713

C:\Users\David\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\preprocessing\_encoders.py:188: UserWarning:

Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros

	Name	Nationality	National_Position	Club_Position	Height	Weight	Preffered_Foot	Age	Work_Rate	Weak_foot	...	Agility	Jumping	Heading	Shot_Power	Finishing	Long_Shots	Curve	Freekick_Accuracy	Penalties	Volleys
0	Cristiano Ronaldo	Portugal	LS	LW	185	80	Right	32	High / Low	4	...	90	95	85	92	93	90	81	76	85	88
1	Lionel Messi	Argentina	RW	RW	170	72	Left	29	Medium / Medium	4	...	90	68	71	85	95	88	89	90	74	85
2	Neymar	Brazil	LW	LW	174	68	Right	25	High / Medium	5	...	96	61	62	78	89	77	79	84	81	83
3	Luis Suárez	Uruguay	LS	ST	182	85	Right	30	High / Medium	4	...	86	69	77	87	94	86	86	84	85	88
4	Manuel Neuer	Germany	GK	GK	193	92	Right	31	Medium / Medium	4	...	52	78	25	25	13	16	14	11	47	11
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
17583	Adam Dunbar	Republic of Ireland	NaN	Sub	183	82	Right	19	Medium / Medium	1	...	27	56	14	16	13	13	11	13	15	12
17584	Dylan McGoey	Republic of Ireland	NaN	Sub	185	80	Right	19	Medium / Medium	2	...	28	53	12	17	12	11	12	13	16	12
17585	Tommy Ouldridge	England	NaN	Res	173	61	Right	18	High / Medium	2	...	54	61	41	44	28	42	35	36	42	37
17586	Mark Foden	Scotland	NaN	Sub	180	80	Right	21	Medium / Medium	3	...	34	48	15	23	14	12	13	12	24	12
17587	Barry Richardson	England	NaN	Sub	185	77	Right	47	Medium / Medium	2	...	38	51	12	13	11	16	12	11	22	12

	Name	Nationality	Club_Position	Height	Weight	Preffered_Foot	Age	Work_Rate	Weak_foot	Skill_Moves	...	Shot_Power	Finishing	Long_Shots	Curve	Freekick_Accuracy	Penalties	Volleys	Posicion	Player	Weekly Salary
0	Cristiano Ronaldo	Portugal	LW	185	80	Right	32	High / Low	4	5	...	92	93	90	81	76	85	88	central_ataque	Cristiano Ronaldo	1248536.0
1	Lionel Messi	Argentina	RW	170	72	Left	29	Medium / Medium	4	4	...	85	95	88	89	90	74	85	central_ataque	Lionel Messi	1538905.0
2	Neymar	Brazil	LW	174	68	Right	25	High / Medium	5	5	...	78	89	77	79	84	81	83	central_ataque	Neymar	797726.0
3	Luis Suárez	Uruguay	ST	182	85	Right	30	High / Medium	4	4	...	87	94	86	86	84	85	88	ataque	Luis Suárez	508923.0
4	Manuel Neuer	Germany	GK	193	92	Right	31	Medium / Medium	4	1	...	25	13	16	14	11	47	11	arquero	Manuel Neuer	326233.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1861	Phillip Menzel	Germany	Res	191	83	Right	18	Medium / Medium	3	1	...	25	5	10	12	8	15	5	NaN	Phillip Menzel	2034.0
1862	Manuel Akanji	Switzerland	Sub	187	85	Right	21	Medium / Medium	2	2	...	38	25	18	26	27	40	31	NaN	Manuel Akanji	54176.0
1863	Moritz Nicolas	Germany	Res	195	87	Right	19	Medium / Medium	2	1	...	22	9	10	17	14	17	11	NaN	Moritz Nicolas	2262.0
1864	Giacomo Satalino	Italy	Sub	188	70	Right	17	Medium / Medium	1	1	...	19	6	5	13	11	18	9	NaN	Giacomo Satalino	2827.0
1865	Nicolò Zaniolo	Italy	Res	185	80	Right	17	High / Low	3	2	...	50	49	42	54	51	49	39	NaN	Nicolò Zaniolo	28187.0

Laboratorio 7: Aprendizaje Supervisado 🔮

Cuerpo Docente:¶

Equipo: SUPER IMPORTANTE - notebooks sin nombre no serán revisados¶

Link de repositorio de GitHub: `https://github.com/johnny-godoy/laboratorios-mds/blob/main/lab%207/laboratorio_7.ipynb`¶

Temas a tratar¶

Reglas¶

Objetivos principales del laboratorio¶

Importamos librerías útiles 😸¶

1. Predicciones Futboleras¶

1.1 Predicción de Seleccionados Nacionales¶

1.1.1 Generación de Labels para la Clasificación [Sin Puntaje]¶

1.1.2 Camino a la clasificación [1 punto]¶

1.1.3 Entrenemos los pipelines [1 punto]¶

1.2 Predicción de posiciones de jugadores [2 puntos]¶

1.3 Predicción de Sueldos [2 puntos]¶

Conclusión¶

Laboratorio 7: Aprendizaje Supervisado 🔮

Cuerpo Docente:¶

Equipo: SUPER IMPORTANTE - notebooks sin nombre no serán revisados¶

Link de repositorio de GitHub: https://github.com/johnny-godoy/laboratorios-mds/blob/main/lab%207/laboratorio_7.ipynb¶

Temas a tratar¶

Reglas¶

Objetivos principales del laboratorio¶

Importamos librerías útiles 😸¶

1. Predicciones Futboleras¶

1.1 Predicción de Seleccionados Nacionales¶

1.1.1 Generación de Labels para la Clasificación [Sin Puntaje]¶

1.1.2 Camino a la clasificación [1 punto]¶

1.1.3 Entrenemos los pipelines [1 punto]¶

1.2 Predicción de posiciones de jugadores [2 puntos]¶

1.3 Predicción de Sueldos [2 puntos]¶

Conclusión¶

Link de repositorio de GitHub: `https://github.com/johnny-godoy/laboratorios-mds/blob/main/lab%207/laboratorio_7.ipynb`¶