# Libreria Core del lab.
import numpy as np
import pandas as pd
from IPython.display import display

#Libreria para plotear
# !pip install --upgrade plotly
import plotly.express as px

# Librerias utiles
from sklearn.preprocessing import StandardScaler

[
  {
    "names": "Rita Courtney",
    "gender": "female",
    "race/ethnicity": "group B",
    "parental level of education": "some high school",
    "lunch": "standard",
    "test preparation course": "none",
    "math score": 37,
    "reading score": 46,
    "writing score": 46
  },
  {
    "names": "Madeline Fuller",
    "gender": "female",
    "race/ethnicity": "group C",
    "parental level of education": "some high school",
    "lunch": "standard",
    "test preparation course": "none",
    "math score": 124,
    "reading score": 142,
    "writing score": 142
  }
]


# Load the raw student records from the JSON export.
df_grades = pd.read_json("students_grades.json")
# Names of the three score columns.
columnas_notas = ["math score", "reading score", "writing score"]
# Empty strings are replaced with NaN, then the score columns are cast to float.
df_grades[columnas_notas] = (
    df_grades[columnas_notas]
    .replace("", np.nan)
    .astype(float)
)


def exploratory_data_analysis(dataframe):
    """Print and display a quick exploratory summary of ``dataframe``.

    The report covers, in order: shape, column names, head/tail/random
    sample rows, ``describe()`` output, null counts per column, unique value
    counts per column, the number of rows with a repeated ``names`` value,
    those rows themselves, and a box plot of the frame.

    The frame must contain a ``names`` column, since duplicates are detected
    on it. Nothing is returned; all output goes through ``print``,
    ``display`` and the plotting backend.
    """
    # 1. Dimensions of the frame.
    filas, columnas = dataframe.shape
    print(f"1.- Los datos tienen {filas} filas y {columnas} columnas")

    # 2. Column names.
    print(f"2.- Los datos tienen las siguientes columnas: {list(dataframe.columns)}")

    # 3. First rows, last rows and a random sample.
    print("3.- Ejemplos de filas")
    print("Los 5 primeros datos son:")
    display(dataframe.head(5))

    print("Los 5 últimos datos son:")
    display(dataframe.tail(5))

    print("Muestra de 5 datos aleatorios:")
    # sample(5) raises ValueError when the frame has fewer than 5 rows,
    # so the sample size is capped at the number of rows.
    display(dataframe.sample(min(5, filas)))

    # 4. Summary statistics.
    print("4.- Descripción de variables numéricas")
    display(dataframe.describe())

    # 5. Null values per column.
    print(f"5.- La cantidad de valores nulos para cada atributo son:\n{dataframe.isna().sum()}")

    # 6. Distinct values per column.
    print(f"6.- La cantidad de valores únicos para cada atributo son:\n{dataframe.nunique()}")

    # 7. Rows whose name already appeared earlier in the frame. With the
    # default keep="first", the first occurrence of each name is not flagged.
    mascara_duplicados = dataframe.duplicated(subset="names")
    print(f"7.- La cantidad de valores duplicados es {mascara_duplicados.sum()}")

    # 8. The flagged rows themselves.
    print("8.- Los datos de los nombres duplicados son:")
    display(dataframe[mascara_duplicados])

    # 9. Box plot of the frame.
    print("9.- Gráfico de caja:")
    dataframe.plot.box()


exploratory_data_analysis(df_grades)

1.- Los datos tienen 1400 filas y 9 columnas
2.- Los datos tienen las siguientes columnas: ['names', 'gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course', 'math score', 'reading score', 'writing score']
3.- Ejemplos de filas
Los 5 primeros datos son:

Los 5 últimos datos son:

Muestra de 5 datos aleatorios:

4.- Descripción de variables numéricas

5.- La cantidad de valores nulos para cada atributo son:
names                           0
gender                          0
race/ethnicity                  0
parental level of education     0
lunch                           0
test preparation course         0
math score                     40
reading score                  40
writing score                  47
dtype: int64
6.- La cantidad de valores únicos para cada atributo son:
names                          1153
gender                            2
race/ethnicity                    5
parental level of education       6
lunch                             2
test preparation course           2
math score                       96
reading score                    88
writing score                    92
dtype: int64
7.- La cantidad de valores duplicados es 247
8.- Los datos de los nombres duplicados son:

9.- Gráfico de caja:


exploratory_data_analysis(df_grades)

1.- El DataFrame tiene 1400 filas y 9 columnas

2.- El DataFrame esta compuesto por las siguientes columnas: ['names', 'gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course', 'math score', 'reading score', 'writing score']

3.- Ejemplos de filas del DataFrame:

Primeras 5 filas:

Últimas 5 filas:

Muestreo aleatorio de 5 filas:

4.- Descripción numérica del Dataframe:

5.- Cantidad de valores nulos por columna:

names                           0
gender                          0
race/ethnicity                  0
parental level of education     0
lunch                           0
test preparation course         0
math score                     40
reading score                  40
writing score                  47
dtype: int64

6.- Cantidad de valores únicos por columna:

names                          1153
gender                            2
race/ethnicity                    5
parental level of education       6
lunch                             2
test preparation course           2
math score                       96
reading score                    88
writing score                    92
dtype: int64

7.- Presenta un total de 247 filas con nombres duplicados.

8.- Las filas con nombres duplicados son:

9.- Boxplot de notas:


def clean_data(dataframe):
    """Return a cleaned copy of ``dataframe`` with a fresh 0..n-1 index.

    Cleaning steps, in order:
      1. keep only the first row for each value of ``names``;
      2. drop every row that still contains a missing value;
      3. keep rows whose highest score among ``columnas_notas`` is at most 100.
    """
    # NOTE(review): duplicates are removed before incomplete rows, so a name
    # whose first record has a missing value is dropped entirely even if a
    # later record for the same name is complete. Confirm this is intended.
    unicos = dataframe.drop_duplicates(subset="names")
    completos = unicos.dropna()

    nota_maxima = completos[columnas_notas].max(axis=1)
    validos = completos[nota_maxima <= 100]

    nuevo_indice = np.arange(len(validos))
    return validos.set_index(nuevo_indice)


df_grades = clean_data(df_grades)
df_grades


df_grades = clean_data(df_grades)
df_grades


exploratory_data_analysis(df_grades)

1.- Los datos tienen 875 filas y 9 columnas
2.- Los datos tienen las siguientes columnas: ['names', 'gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course', 'math score', 'reading score', 'writing score']
3.- Ejemplos de filas
Los 5 primeros datos son:

Los 5 últimos datos son:

Muestra de 5 datos aleatorios:

4.- Descripción de variables numéricas

5.- La cantidad de valores nulos para cada atributo son:
names                          0
gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64
6.- La cantidad de valores únicos para cada atributo son:
names                          875
gender                           2
race/ethnicity                   5
parental level of education      6
lunch                            2
test preparation course          2
math score                      79
reading score                   72
writing score                   77
dtype: int64
7.- La cantidad de valores duplicados es 0
8.- Los datos de los nombres duplicados son:

9.- Gráfico de caja:


def nota_chilena(dataframe_in, columna):
    """Return the selected column(s) converted to the Chilean grading scale.

    ``columna`` is used directly as a column selector on ``dataframe_in``.
    Every value ``x`` is mapped to ``x * 0.06 + 1`` and rounded to two
    decimals, so 0 becomes 1.0 and 100 becomes 7.0.
    """
    puntajes = dataframe_in[columna]
    escalados = puntajes.mul(0.06).add(1)
    return escalados.round(2)

def con_notas_chilenas(dataframe_in):
    """Return a copy of ``dataframe_in`` whose score columns use the Chilean scale.

    The columns listed in ``columnas_notas`` are converted with
    ``nota_chilena``; the input frame is not modified.
    """
    notas_convertidas = nota_chilena(dataframe_in, columnas_notas)
    copia = dataframe_in.copy()
    copia[columnas_notas] = notas_convertidas
    return copia


def alumnos_promedio(dataframe_in):
    """Return a copy of ``dataframe_in`` with a ``GPA`` column added.

    ``GPA`` is the row-wise mean of the Chilean-scale versions of the
    ``columnas_notas`` columns, rounded to two decimals. The score columns
    in the returned frame keep the values they had in ``dataframe_in``.
    """
    notas = con_notas_chilenas(dataframe_in)[columnas_notas]
    promedio = notas.mean(axis=1).round(2)
    return dataframe_in.assign(GPA=promedio)


df_grades = alumnos_promedio(df_grades)
df_grades.head()


df_grades = alumnos_promedio(df_grades)
df_grades.head()


def reprobados_destacados(dataframe_in):
    """Split students into failing and outstanding groups.

    ``dataframe_in`` must contain a numeric ``GPA`` column and the score
    columns listed in ``columnas_notas``. The score columns are converted
    with ``nota_chilena`` before the 5.8 threshold is applied, matching how
    ``alumnos_promedio`` derives ``GPA`` from the same columns.

    Returns a tuple ``(reprobados, modificado, destacados)``:
      * ``reprobados``: rows whose ``GPA`` is below 4.0;
      * ``modificado``: a copy of the input where the ``GPA`` of those rows
        is replaced by the string ``"R"``;
      * ``destacados``: rows whose lowest Chilean-scale grade is at least
        5.8, sorted by ``GPA`` in descending order.
    """
    mascara_reprobados = dataframe_in["GPA"] < 4.0
    reprobados = dataframe_in[mascara_reprobados]

    modificado = dataframe_in.copy()
    # The column will hold both floats and the string "R". Casting to object
    # first avoids assigning a string into a float64 column, which recent
    # pandas versions deprecate as an incompatible-dtype assignment.
    modificado["GPA"] = modificado["GPA"].astype(object)
    modificado.loc[mascara_reprobados, "GPA"] = "R"

    # 5.8 is a Chilean-scale grade, so the scores are converted before the
    # comparison instead of being compared as raw values.
    notas_chilenas = nota_chilena(dataframe_in, columnas_notas)
    destacados = dataframe_in.loc[notas_chilenas.min(axis=1) >= 5.8]
    destacados = destacados.sort_values("GPA", ascending=False)

    return reprobados, modificado, destacados


dataframe1, dataframe2, dataframe3 = reprobados_destacados(df_grades)
dataframe1


dataframe2


dataframe3


reprobados, modificado, destacados = reprobados_destacados(df_grades)
print("Reprobados")
display(reprobados)
print("Modificado")
display(modificado)
print("Destacados")
display(destacados)

Reprobados

Modificado

Destacados


# Names of the students whose GPA is queried.
lista_alumnos = [
    "Lucille Stanphill",
    "Marcus Mcfarland",
    "Matthew Freeman",
    "Dana Mojica",
    "Paul Hyder",
    "Jeffrey Korn",
    "Robert Strain",
    "Ronald Jett",
    "Lorena Reed",
    "Hazel Posey",
]
# NOTE(review): consulta_gpa is defined after this call in the file, and
# dataframe2 comes from an earlier reprobados_destacados unpacking. This
# cell presumably relied on notebook execution order; confirm.
consulta_gpa(dataframe2, lista_alumnos)


def consulta_gpa(dataframe_in, lista_alumnos):
    """Return the ``names`` and ``GPA`` columns for the requested students.

    Rows are kept in the order they have in ``dataframe_in``. Names in
    ``lista_alumnos`` that are absent from the frame do not appear in the
    result.
    """
    esta_en_lista = dataframe_in["names"].isin(lista_alumnos)
    return dataframe_in.loc[esta_en_lista, ["names", "GPA"]]

# Names of the students whose GPA is queried.
lista_alumnos = [
    "Lucille Stanphill",
    "Marcus Mcfarland",
    "Matthew Freeman",
    "Dana Mojica",
    "Paul Hyder",
    "Jeffrey Korn",
    "Robert Strain",
    "Ronald Jett",
    "Lorena Reed",
    "Hazel Posey",
]
# Look the students up in the frame where failing GPAs were replaced by "R".
consulta_gpa(modificado, lista_alumnos)


# Quintile label for each student's GPA.
# The column must be named "GPA cuantiles" so that the chart works.
df_grades["GPA cuantiles"] = pd.qcut(
    df_grades["GPA"],
    q=5,
    labels=["Bajísimo", "Bajo", "Medio", "Alto", "Muy alto"],
)


# Scatter plot of GPA against parental education level, colored by the
# "GPA cuantiles" column; rows are sorted by GPA before plotting.
fig = px.scatter(
    df_grades.sort_values(by="GPA"),
    x="GPA",
    y="parental level of education",
    hover_data=["gender"],
    color="GPA cuantiles",
    title="GPA por Nivel de educación de los padres",
    color_discrete_sequence=["tomato", "orange", "yellow", "greenyellow", "green"]
)

fig

	names	gender	race/ethnicity	parental level of education	lunch	test preparation course	math score	reading score	writing score
1395	Alexander Dillon	male	group D	associate's degree	standard	none	193.0	142.0	193.0
1396	Marina Zeigler	female	group C	bachelor's degree	free/reduced	completed	66.0	74.0	81.0
1397	Laurie Carter	female	group B	some high school	standard	completed	54.0	61.0	62.0
1398	Joseph Mccoy	male	group D	some college	free/reduced	none	193.0	196.0	193.0
1399	Amanda Perez	female	group A	high school	standard	completed	68.0	80.0	76.0

	names	gender	race/ethnicity	parental level of education	lunch	test preparation course	math score	reading score	writing score
123	Milton Davidson	male	group C	associate's degree	free/reduced	completed	124.0	NaN	193.0
91	William Austin	male	group C	associate's degree	standard	completed	57.0	54.0	56.0
112	John Salter	male	group D	some college	standard	none	44.0	54.0	53.0
465	Alfonzo Kellar	male	group C	some high school	standard	none	75.0	72.0	62.0
695	Donna Medlin	female	group B	high school	free/reduced	none	38.0	142.0	193.0

	math score	reading score	writing score
count	1360.000000	1360.000000	1353.000000
mean	89.538971	93.086029	91.852919
std	47.152422	46.385775	47.043570
min	0.000000	17.000000	10.000000
25%	59.000000	63.000000	62.000000
50%	72.000000	76.000000	75.000000
75%	97.250000	100.000000	100.000000
max	196.000000	196.000000	196.000000

	names	gender	race/ethnicity	parental level of education	lunch	test preparation course	math score	reading score	writing score
12	Rita Courtney	female	group B	some high school	standard	none	193.0	193.0	193.0
81	Elizabeth Quintero	female	group B	some college	standard	none	82.0	85.0	87.0
160	Michelle Gonzales	female	group B	some high school	standard	completed	60.0	70.0	74.0
170	Carla Mendoza	female	group D	associate's degree	free/reduced	none	43.0	60.0	58.0
176	Darlene Parker	female	group C	associate's degree	standard	none	54.0	61.0	58.0
...	...	...	...	...	...	...	...	...	...
1385	James Crawford	male	group D	master's degree	standard	none	193.0	147.0	131.0
1390	Gloria Bradford	female	group B	high school	standard	none	124.0	64.0	193.0
1393	Adam Romero	male	group C	high school	standard	completed	124.0	193.0	49.0
1395	Alexander Dillon	male	group D	associate's degree	standard	none	193.0	142.0	193.0
1398	Joseph Mccoy	male	group D	some college	free/reduced	none	193.0	196.0	193.0

	names	gender	race/ethnicity	parental level of education	lunch	test preparation course	math score	reading score	writing score
1395	Alexander Dillon	male	group D	associate's degree	standard	none	193.0	142.0	193.0
1396	Marina Zeigler	female	group C	bachelor's degree	free/reduced	completed	66.0	74.0	81.0
1397	Laurie Carter	female	group B	some high school	standard	completed	54.0	61.0	62.0
1398	Joseph Mccoy	male	group D	some college	free/reduced	none	193.0	196.0	193.0
1399	Amanda Perez	female	group A	high school	standard	completed	68.0	80.0	76.0

Laboratorio 3: El Pandas no muerde (act. I) 🐼

Cuerpo Docente:¶

Equipo: SUPER IMPORTANTE - notebooks sin nombre no serán revisados¶

Link de repositorio de GitHub: `https://github.com/johnny-godoy/laboratorios-mds/blob/main/lab%203/laboratorio_3.ipynb`¶

Reglas:¶

Temas a tratar¶

Objetivos principales del laboratorio¶

Librerías útiles para el Lab 😸¶

1. Rendimiento en Estudiantes 📚¶

1.1 Carga e Inspección de Datos [1.5 Puntos]¶

1.2 Limpieza de Datos [1 punto]¶

1.3 Trabajando con Datos¶

1.3.1 Transformación de notas a "Nota Chilena" [0.5 Puntos]¶

1.3.2 Obtención de promedio de alumnos [0.5 Puntos]¶

1.3.3 Reprobados y Destacados [1.5 Puntos, 0.5 cada DataFrame]¶

1.3.4 Consultas de Alumnos [0.5 Puntos]¶

1.3.5 Quintiles de Rendimiento [0.5 Puntos]¶

Conclusión¶

	names	gender	race/ethnicity	parental level of education	lunch	test preparation course	math score	reading score	writing score
1282	Alberto Roberts	male	group D	associate's degree	standard	none	52.0	55.0	49.0
604	Ronald Wilson	male	group C	some high school	standard	none	64.0	58.0	51.0
559	Christine Reed	female	group E	bachelor's degree	standard	completed	188.0	142.0	131.0
380	Edna Johnson	female	group E	some college	standard	completed	193.0	124.0	196.0
1392	Richard Young	male	group D	high school	standard	none	69.0	75.0	71.0

	names	gender	race/ethnicity	parental level of education	lunch	test preparation course	math score	reading score	writing score
454	Emily Devins	female	group B	associate's degree	standard	none	47.0	49.0	50.0
481	Micki Banks	female	group A	high school	standard	none	55.0	73.0	73.0
160	Michael Reel	male	group D	high school	standard	none	45.0	48.0	46.0
869	Amy Sieving	female	group C	high school	standard	none	29.0	29.0	30.0
409	Hubert Mitchell	male	group D	high school	standard	none	66.0	69.0	63.0

	math score	reading score	writing score
count	875.000000	875.000000	875.000000
mean	66.225143	69.195429	68.114286
std	15.076716	14.629270	15.174220
min	0.000000	17.000000	10.000000
25%	57.000000	59.000000	57.500000
50%	66.000000	70.000000	69.000000
75%	76.500000	79.000000	79.000000
max	100.000000	100.000000	100.000000

	names	gender	race/ethnicity	parental level of education	lunch	test preparation course	math score	reading score	writing score	GPA
0	Richard Pryor	male	group E	bachelor's degree	standard	completed	7.00	7.00	7.00	7.0
1	Sandra Pompey	female	group E	associate's degree	standard	none	7.00	7.00	7.00	7.0
2	Elizabeth Beasley	female	group E	bachelor's degree	standard	none	7.00	7.00	7.00	7.0
3	Emma Gray	female	group E	bachelor's degree	standard	completed	6.94	7.00	7.00	6.98
4	Emma Brasher	female	group D	some high school	standard	completed	6.82	7.00	7.00	6.94
...	...	...	...	...	...	...	...	...	...	...
89	Rolando Widmer	male	group C	high school	standard	completed	5.92	6.04	5.92	5.96
90	Lucille Stanphill	female	group C	high school	standard	none	5.86	6.04	5.92	5.94
91	Steven Kelly	male	group D	some college	standard	none	5.86	5.92	6.04	5.94
92	Peter Perez	male	group D	master's degree	standard	none	5.86	5.86	6.04	5.92
93	Harry Berry	male	group B	associate's degree	standard	completed	5.86	5.92	5.92	5.9

	names	GPA
198	Lucille Stanphill	5.94
372	Marcus Mcfarland	4.86
418	Matthew Freeman	R
421	Dana Mojica	R
675	Robert Strain	4.4
679	Ronald Jett	R
681	Hazel Posey	R
753	Lorena Reed	R

Laboratorio 3: El Pandas no muerde (act. I) 🐼

Cuerpo Docente:¶

Equipo: SUPER IMPORTANTE - notebooks sin nombre no serán revisados¶

Link de repositorio de GitHub: https://github.com/johnny-godoy/laboratorios-mds/blob/main/lab%203/laboratorio_3.ipynb¶

Reglas:¶

Temas a tratar¶

Objetivos principales del laboratorio¶

Librerías útiles para el Lab 😸¶

1. Rendimiento en Estudiantes 📚¶

1.1 Carga e Inspección de Datos [1.5 Puntos]¶

1.2 Limpieza de Datos [1 punto]¶

1.3 Trabajando con Datos¶

1.3.1 Transformación de notas a "Nota Chilena" [0.5 Puntos]¶

1.3.2 Obtención de promedio de alumnos [0.5 Puntos]¶

1.3.3 Reprobados y Destacados [1.5 Puntos, 0.5 cada DataFrame]¶

1.3.4 Consultas de Alumnos [0.5 Puntos]¶

1.3.5 Quintiles de Rendimiento [0.5 Puntos]¶

Conclusión¶

Link de repositorio de GitHub: `https://github.com/johnny-godoy/laboratorios-mds/blob/main/lab%203/laboratorio_3.ipynb`¶