#importación librerías
import pandas as pd
#Carga los datos en un DataFrame de Pandas
df = pd.read_csv('base_players_arg.csv', header = 0)
df.head(10)
playerId | competitionId | seasonId | position_code | position_name | total_matches | total_matchesInStart | total_matchesSubstituted | total_matchesComingOff | total_minutesOnField | ... | label | date | dateutc | status_1 | competitionId_2 | seasonId_2 | roundId_1 | gameweek | contractExpiration | agencies | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 86432 | 146 | 188660 | gk | Goalkeeper | 1.0 | 1.0 | 0.0 | 0.0 | 98.0 | ... | Barracas Central - Instituto, 3 - 0 | 2023-05-05 20:30:00+00:00 | 2023-05-05 18:30:00+00:00 | Played | 146 | 188660 | 4428177 | 15 | 2024-12-31 | Agustin Grillo |
1 | 700015 | 146 | 188660 | dmf | Defensive Midfielder | 1.0 | 0.0 | 0.0 | 1.0 | 55.0 | ... | Gimnasia La Plata - Colón, 1 - 0 | 2023-12-01 21:00:00+00:00 | 2023-12-01 20:00:00+00:00 | Played | 146 | 188660 | 4431000 | 1 | 2024-12-31 | NaN |
2 | 893490 | 146 | 188660 | lb5 | Left Back (5 at the back) | 1.0 | 1.0 | 1.0 | 0.0 | 77.0 | ... | Instituto - Barracas Central, 0 - 0 | 2023-11-14 01:00:00+00:00 | 2023-11-14 00:00:00+00:00 | Played | 146 | 188660 | 4429310 | 13 | NaN | NaN |
3 | 708568 | 146 | 188660 | dmf | Defensive Midfielder | 1.0 | 0.0 | 0.0 | 1.0 | 14.0 | ... | Instituto - Barracas Central, 0 - 0 | 2023-11-14 01:00:00+00:00 | 2023-11-14 00:00:00+00:00 | Played | 146 | 188660 | 4429310 | 13 | 2025-12-31 | NaN |
4 | 418027 | 146 | 188660 | cf | Striker | 1.0 | 1.0 | 1.0 | 0.0 | 16.0 | ... | Barracas Central - Instituto, 3 - 0 | 2023-05-05 20:30:00+00:00 | 2023-05-05 18:30:00+00:00 | Played | 146 | 188660 | 4428177 | 15 | 2024-12-31 | Proyecta Players |
5 | 418027 | 146 | 188660 | cf | Striker | 1.0 | 1.0 | 1.0 | 0.0 | 77.0 | ... | Instituto - Barracas Central, 0 - 0 | 2023-11-14 01:00:00+00:00 | 2023-11-14 00:00:00+00:00 | Played | 146 | 188660 | 4429310 | 13 | 2024-12-31 | Proyecta Players |
6 | 88934 | 146 | 188660 | rdmf | Right Defensive Midfielder | 1.0 | 1.0 | 1.0 | 0.0 | 85.0 | ... | Instituto - Barracas Central, 0 - 0 | 2023-11-14 01:00:00+00:00 | 2023-11-14 00:00:00+00:00 | Played | 146 | 188660 | 4429310 | 13 | 2024-12-31 | NaN |
7 | 88934 | 146 | 188660 | dmf | Defensive Midfielder | 1.0 | 1.0 | 1.0 | 0.0 | 85.0 | ... | Barracas Central - Instituto, 3 - 0 | 2023-05-05 20:30:00+00:00 | 2023-05-05 18:30:00+00:00 | Played | 146 | 188660 | 4428177 | 15 | 2024-12-31 | NaN |
8 | 364018 | 146 | 188660 | lcb | Left Centre Back | 1.0 | 1.0 | 0.0 | 0.0 | 98.0 | ... | Barracas Central - Instituto, 3 - 0 | 2023-05-05 20:30:00+00:00 | 2023-05-05 18:30:00+00:00 | Played | 146 | 188660 | 4428177 | 15 | 2024-12-31 | Marco Vanzini CAASTELLAR GROUP |
9 | 778525 | 146 | 188660 | rcmf | Right Centre Midfielder | 1.0 | 1.0 | 0.0 | 0.0 | 104.0 | ... | Gimnasia La Plata - Colón, 1 - 0 | 2023-12-01 21:00:00+00:00 | 2023-12-01 20:00:00+00:00 | Played | 146 | 188660 | 4431000 | 1 | NaN | NaN |
10 rows × 314 columns
#Conocer número de filas y número de columnas
df.shape
(38873, 314)
print(df.dtypes)
playerId int64 competitionId int64 seasonId int64 position_code object position_name object ... seasonId_2 int64 roundId_1 int64 gameweek int64 contractExpiration object agencies object Length: 314, dtype: object
# Mostrar cuántos valores nulos hay en cada columna
print(df.isnull().sum())
playerId 0 competitionId 0 seasonId 0 position_code 0 position_name 0 ... seasonId_2 0 roundId_1 0 gameweek 0 contractExpiration 5471 agencies 18268 Length: 314, dtype: int64
#Eliminar filas con valores nulos
df_sin_nulos = df.dropna()
df_sin_nulos.head(10)
playerId | competitionId | seasonId | position_code | position_name | total_matches | total_matchesInStart | total_matchesSubstituted | total_matchesComingOff | total_minutesOnField | ... | label | date | dateutc | status_1 | competitionId_2 | seasonId_2 | roundId_1 | gameweek | contractExpiration | agencies |
---|
0 rows × 314 columns
# Identificar columnas con valores nulos
columnas_con_nulos = df.columns[df.isnull().any()].tolist()
# Mostrar las columnas que tienen al menos un valor nulo
print("Columnas que contienen valores nulos:")
print(columnas_con_nulos)
Columnas que contienen valores nulos: ['teamId', 'middleName', 'birthDate', 'age', 'foot', 'currentTeamId', 'currentNationalTeamId', 'imageDataURL', 'teamIdFormador', 'currentTeamIsFormador', 'condicionFormados', 'ldp_player_id', 'contractExpiration', 'agencies']
# Determinar cuántos valores nulos tiene cada columna
if columnas_con_nulos:
print("\nCantidad de valores nulos por columna:")
for columna in columnas_con_nulos:
num_nulos = df[columna].isnull().sum()
print(f"{columna}: {num_nulos} valores nulos")
else:
print("No hay columnas con valores nulos en el DataFrame.")
Cantidad de valores nulos por columna: teamId: 257 valores nulos middleName: 38873 valores nulos birthDate: 1 valores nulos age: 1 valores nulos foot: 112 valores nulos currentTeamId: 714 valores nulos currentNationalTeamId: 35658 valores nulos imageDataURL: 88 valores nulos teamIdFormador: 638 valores nulos currentTeamIsFormador: 714 valores nulos condicionFormados: 31677 valores nulos ldp_player_id: 30 valores nulos contractExpiration: 5471 valores nulos agencies: 18268 valores nulos
# Mostrar el tipo de datos de cada columna con valores nulos
df.dtypes[columnas_con_nulos]
teamId float64 middleName float64 birthDate object age float64 foot object currentTeamId float64 currentNationalTeamId float64 imageDataURL object teamIdFormador float64 currentTeamIsFormador object condicionFormados object ldp_player_id float64 contractExpiration object agencies object dtype: object
df.drop(columns=["middleName"], inplace=True)
df[["foot", "imageDataURL", "condicionFormados", "agencies"]] = df[["foot", "imageDataURL", "condicionFormados", "agencies"]].fillna("Desconocido")
df["birthDate"] = pd.to_datetime(df["birthDate"], errors='coerce')
df["contractExpiration"] = pd.to_datetime(df["contractExpiration"], errors='coerce')
df["birthDate"].fillna(pd.to_datetime("1900-01-01"), inplace=True)
df["contractExpiration"].fillna(pd.to_datetime("1900-01-01"), inplace=True)
df["age"].fillna(df["age"].mean(), inplace=True)
df[["teamId", "currentTeamId", "currentNationalTeamId", "teamIdFormador", "ldp_player_id"]] = df[["teamId", "currentTeamId", "currentNationalTeamId", "teamIdFormador", "ldp_player_id"]].fillna(-1)
df["currentTeamIsFormador"] = df["currentTeamIsFormador"].astype(bool)
El dataset contiene estadísticas de cada jugador por partido, por lo que es normal que haya múltiples filas para un mismo jugador. En este caso, el jugador por sí solo no puede ser considerado como clave primaria.
Los identificadores utilizados para partidos y jugadores son matchId y playerId. Para definir una clave primaria, combinaremos ambos campos.
Para detectar posibles duplicados, agregamos una nueva columna que resulta de la unión de estos dos identificadores.
Solo para el cálculo de esta nueva columna, convertimos ambas variables a formato string y así generamos nuestra clave primaria.
df['primary_key'] = df['playerId'].astype(str) + '_' + df['matchId'].astype(str)
df.head()
playerId | competitionId | seasonId | position_code | position_name | total_matches | total_matchesInStart | total_matchesSubstituted | total_matchesComingOff | total_minutesOnField | ... | date | dateutc | status_1 | competitionId_2 | seasonId_2 | roundId_1 | gameweek | contractExpiration | agencies | primary_key | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 86432 | 146 | 188660 | gk | Goalkeeper | 1.0 | 1.0 | 0.0 | 0.0 | 98.0 | ... | 2023-05-05 20:30:00+00:00 | 2023-05-05 18:30:00+00:00 | Played | 146 | 188660 | 4428177 | 15 | 2024-12-31 | Agustin Grillo | 86432_5420312 |
1 | 700015 | 146 | 188660 | dmf | Defensive Midfielder | 1.0 | 0.0 | 0.0 | 1.0 | 55.0 | ... | 2023-12-01 21:00:00+00:00 | 2023-12-01 20:00:00+00:00 | Played | 146 | 188660 | 4431000 | 1 | 2024-12-31 | Desconocido | 700015_5543867 |
2 | 893490 | 146 | 188660 | lb5 | Left Back (5 at the back) | 1.0 | 1.0 | 1.0 | 0.0 | 77.0 | ... | 2023-11-14 01:00:00+00:00 | 2023-11-14 00:00:00+00:00 | Played | 146 | 188660 | 4429310 | 13 | 1900-01-01 | Desconocido | 893490_5463692 |
3 | 708568 | 146 | 188660 | dmf | Defensive Midfielder | 1.0 | 0.0 | 0.0 | 1.0 | 14.0 | ... | 2023-11-14 01:00:00+00:00 | 2023-11-14 00:00:00+00:00 | Played | 146 | 188660 | 4429310 | 13 | 2025-12-31 | Desconocido | 708568_5463692 |
4 | 418027 | 146 | 188660 | cf | Striker | 1.0 | 1.0 | 1.0 | 0.0 | 16.0 | ... | 2023-05-05 20:30:00+00:00 | 2023-05-05 18:30:00+00:00 | Played | 146 | 188660 | 4428177 | 15 | 2024-12-31 | Proyecta Players | 418027_5420312 |
5 rows × 314 columns
df['primary_key'].nunique()
22873
Usamos el método groupby para identificar las claves primarias que aparecen más de una vez.
df.groupby('primary_key')['primary_key'].count().sort_values(ascending = False)
primary_key 582416_5420331 3 543341_5578281 3 543341_5463605 3 543341_5420459 3 543341_5420446 3 .. 662914_5463628 1 218643_5420323 1 4376_5420119 1 662914_5578170 1 640425_5420422 1 Name: primary_key, Length: 22873, dtype: int64
Inicialmente, contábamos con 38,873 filas, pero solo 22,873 registros únicos según la clave primaria. El método drop_duplicates permite eliminar filas duplicadas en función de una columna específica. En este caso, lo aplicamos sobre primary_key para conservar únicamente los registros únicos.
df.drop_duplicates(subset=['primary_key'], inplace = True)
df.shape
(22873, 314)
df['playerId'].nunique()
1267
df_jugadores = df.drop_duplicates(subset=['playerId'])
df_jugadores['age'].mean()
26.44324393435529
df_jugadores['position_name'].value_counts().head(5)
position_name Striker 188 Right Centre Midfielder 93 Left Centre Midfielder 85 Left Back 79 Goalkeeper 73 Name: count, dtype: int64
import matplotlib.pyplot as plt
import seaborn as sns
# Crear el histograma
plt.figure(figsize=(10, 6))
sns.histplot(df_jugadores['age'], bins=20, kde=True, color='blue')
<Axes: xlabel='age', ylabel='Count'>