Final Project

Student: Yaretsi Bermudez

In [ ]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
In [ ]:
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
In [ ]:
import pandas as pd

# Try to detect the delimiter automatically
df = pd.read_csv('/content/drive/MyDrive/DATOS_LNA.csv', sep=None, engine='python')

# If you know the delimiter, specify it directly
# df = pd.read_csv('/content/drive/MyDrive/DATOS_LNA.csv', sep=',')  # if it is a comma
# df = pd.read_csv('/content/drive/MyDrive/DATOS_LNA.csv', sep=';')  # if it is a semicolon

# Keep an untouched copy of the raw data; the chart and regression cells below
# refer to it as 'bottle' (assumed here to come from this read)
bottle = df.copy()

df.shape
Out[ ]:
(553, 52)
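If the automatic detection ever guesses the separator wrong, one quick check is to let the standard library's csv.Sniffer look at the start of the file (a minimal sketch; it assumes the same file path used above):

import csv

with open('/content/drive/MyDrive/DATOS_LNA.csv', newline='') as f:
    dialect = csv.Sniffer().sniff(f.read(2048))  # inspect the first 2 KB
print(repr(dialect.delimiter))                   # e.g. ',' or ';'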
In [ ]:
# Categorical variables
# Note: these assignments store the encoded NumHuevo/NumPicho values in the
# NomCien/Sitio columns, overwriting the species name and site (see the output below)
label_encoder_NumHuevo = LabelEncoder()
label_encoder_NumPicho = LabelEncoder()
label_encoder_Evid_Saque = LabelEncoder()
label_encoder_Depredado = LabelEncoder()

df['NomCien'] = label_encoder_NumHuevo.fit_transform(df['NumHuevo'])
df['Sitio'] = label_encoder_NumPicho.fit_transform(df['NumPicho'])
df['Evid_Saque'] = label_encoder_Evid_Saque.fit_transform(df['Evid_Saque'])
df['Depredado'] = label_encoder_Depredado.fit_transform(df['Depredado'])

print(df.head())
   ID CodArbol  NomCien     NomCom EspePsitac  Sitio   Año Temporada  \
0    1    PSSG1        4  Genizaro         LNA      0  2019   Oct-Ene   
1    2    PSAP2        4     Panama        LNA      1  2019   Oct-Ene   
2    4    PSAP3        2     Panama        LNA      4  2019   Oct-Ene   
3    5    PHCJ4        1    Jabillo        LNA      4  2019   Oct-Ene   
4    6    PBAO5        2    Ojoche         LNA      1  2019   Oct-Ene   

  MesEscal Escalado  ... ProfEntCavid DAPArbol                    Clima  \
0      Nov       Si  ...           74      324           Soleado viento   
1      Dic       Si  ...           85     13.2  50%nublado mucho viento   
2      Nov       Si  ...           80     11.2  30%nublado mucho viento   
3      Dic       Si  ...           70      4.6   20% nublado sin viento   
4      Dic       Si  ...           29      3.7                 Soleado    

  Nubosidad  Nubosid%  Soleado Solead% Precip  Viento  ActivoAno  
0       NaN       NaN  Soleado     NaN    NaN    Poco         No  
1   Nublado      50.0      NaN     NaN    NaN  Fuerte         Si  
2   Nublado      30.0      NaN     NaN    NaN  Fuerte         Si  
3   Nublado      20.0      NaN     NaN    NaN     NaN         Si  
4       NaN       NaN  Soleado     NaN    NaN     NaN         Si  

[5 rows x 52 columns]
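The cell above stores the encoded NumHuevo/NumPicho values in the NomCien/Sitio columns, which is why the species name and site no longer appear in the output. If the intent is to keep the original columns and add numeric codes alongside them, a hedged alternative (the *_cod column names are illustrative) would be:

from sklearn.preprocessing import LabelEncoder

for col in ['NomCien', 'Sitio', 'Evid_Saque', 'Depredado']:
    le = LabelEncoder()
    # astype(str) makes NaN its own 'nan' category so mixed types do not break the encoder
    df[col + '_cod'] = le.fit_transform(df[col].astype(str))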
In [ ]:
df.head()
Out[ ]:
ID CodArbol NomCien NomCom EspePsitac Sitio Año Temporada MesEscal Escalado ... ProfEntCavid DAPArbol Clima Nubosidad Nubosid% Soleado Solead% Precip Viento ActivoAno
0 1 PSSG1 4 Genizaro LNA 0 2019 Oct-Ene Nov Si ... 74 324 Soleado viento NaN NaN Soleado NaN NaN Poco No
1 2 PSAP2 4 Panama LNA 1 2019 Oct-Ene Dic Si ... 85 13.2 50%nublado mucho viento Nublado 50.0 NaN NaN NaN Fuerte Si
2 4 PSAP3 2 Panama LNA 4 2019 Oct-Ene Nov Si ... 80 11.2 30%nublado mucho viento Nublado 30.0 NaN NaN NaN Fuerte Si
3 5 PHCJ4 1 Jabillo LNA 4 2019 Oct-Ene Dic Si ... 70 4.6 20% nublado sin viento Nublado 20.0 NaN NaN NaN NaN Si
4 6 PBAO5 2 Ojoche LNA 1 2019 Oct-Ene Dic Si ... 29 3.7 Soleado NaN NaN Soleado NaN NaN NaN Si

5 rows × 52 columns

In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 553 entries, 0 to 552
Data columns (total 52 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ID           553 non-null    int64  
 1   CodArbol      553 non-null    object 
 2   NomCien       553 non-null    int64  
 3   NomCom        553 non-null    object 
 4   EspePsitac    553 non-null    object 
 5   Sitio         553 non-null    int64  
 6   Año           553 non-null    int64  
 7   Temporada     548 non-null    object 
 8   MesEscal      553 non-null    object 
 9   Escalado      553 non-null    object 
 10  CodNido       553 non-null    object 
 11  ConCavi       533 non-null    object 
 12  ConCavi_B     533 non-null    object 
 13  ConCavi_R     533 non-null    object 
 14  ConCavi_M     533 non-null    float64
 15  Humedo        533 non-null    object 
 16  PocoHume      533 non-null    object 
 17  Seca          533 non-null    object 
 18  Evid_Activ    15 non-null     float64
 19  Evid_Saque    553 non-null    int64  
 20  Depredado     553 non-null    int64  
 21  CompPadre     471 non-null    object 
 22  PresePadr     472 non-null    float64
 23  UbicPadr      148 non-null    object 
 24  CompPadr      171 non-null    object 
 25  NumHuevo      117 non-null    float64
 26  NumPicho      258 non-null    float64
 27  PichoSaque    53 non-null     object 
 28  PichoMuerto   14 non-null     float64
 29  PichoVuelEx   202 non-null    float64
 30  PichoRehab    7 non-null      float64
 31  EstadPicho    203 non-null    object 
 32  Edad1Dia      183 non-null    float64
 33  Edad2Dia      121 non-null    float64
 34  Edad3Dia      49 non-null     float64
 35  Edad4Dia      1 non-null      float64
 36  AltCavidad    494 non-null    float64
 37  DirCavidad    441 non-null    object 
 38  DirGrado      382 non-null    float64
 39  DirPuntCard   404 non-null    object 
 40  AltEntCavid   459 non-null    float64
 41  AncEntCavid   473 non-null    float64
 42  ProfEntCavid  469 non-null    object 
 43  DAPArbol      490 non-null    object 
 44  Clima         523 non-null    object 
 45  Nubosidad     270 non-null    object 
 46  Nubosid%      224 non-null    float64
 47  Soleado       211 non-null    object 
 48  Solead%       85 non-null     float64
 49  Precip        11 non-null     object 
 50  Viento        347 non-null    object 
 51  ActivoAno     546 non-null    object 
dtypes: float64(18), int64(6), object(28)
memory usage: 224.8+ KB
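Several columns are nearly empty (Evid_Activ has 15 non-null values out of 553, PichoRehab only 7), so a per-column missing-value count is useful before choosing features (a minimal sketch using the df loaded above):

missing = df.isnull().sum().sort_values(ascending=False)
print(missing.head(15))                       # columns with the most missing values
print((missing / len(df)).round(2).head(15))  # same, as a fraction of rows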
In [ ]:
df.describe()
Out[ ]:
ID NomCien Sitio Año ConCavi_M Evid_Activ Evid_Saque Depredado PresePadr NumHuevo ... Edad1Dia Edad2Dia Edad3Dia Edad4Dia AltCavidad DirGrado AltEntCavid AncEntCavid Nubosid% Solead%
count 553.000000 553.000000 553.000000 553.000000 533.000000 15.000000 553.000000 553.000000 472.000000 117.000000 ... 183.000000 121.000000 49.000000 1.0 494.000000 382.000000 459.000000 473.000000 224.000000 85.000000
mean 300.781193 3.471971 2.556962 2021.094033 0.120075 0.733333 1.115732 1.133816 0.792373 1.504274 ... 29.628415 27.884298 24.285714 60.0 14.214730 178.378796 29.188017 14.375264 55.553571 85.470588
std 173.578703 1.111383 1.638783 1.902092 0.325355 0.457738 0.426889 0.450639 0.406039 0.961536 ... 15.321954 15.259527 13.604534 NaN 5.919842 106.788897 28.088252 9.143355 31.790186 25.208931
min 1.000000 0.000000 0.000000 2017.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 2.000000 1.000000 0.000000 60.0 0.451389 0.000000 1.300000 4.000000 5.000000 20.000000
25% 150.000000 4.000000 1.000000 2020.000000 0.000000 0.500000 1.000000 1.000000 1.000000 1.000000 ... 18.000000 15.000000 16.000000 60.0 10.600000 94.000000 14.000000 9.000000 20.000000 90.000000
50% 300.000000 4.000000 4.000000 2021.000000 0.000000 1.000000 1.000000 1.000000 1.000000 1.000000 ... 30.000000 28.000000 21.000000 60.0 13.900000 180.000000 20.000000 12.000000 50.000000 100.000000
75% 453.000000 4.000000 4.000000 2023.000000 0.000000 1.000000 1.000000 1.000000 1.000000 2.000000 ... 42.000000 40.000000 34.000000 60.0 17.285000 270.000000 33.000000 17.000000 90.000000 100.000000
max 595.000000 4.000000 4.000000 2024.000000 1.000000 1.000000 3.000000 3.000000 1.000000 3.000000 ... 62.000000 60.000000 60.000000 60.0 100.000000 713.000000 240.000000 130.000000 100.000000 100.000000

8 rows × 24 columns

In [ ]:
bottle_df = bottle_df[:500]      # Keep only the first 500 rows for the regression
bottle_df.head()
Out[ ]:
   NumHuevo  NumPicho
0       0.0       1.0
1       0.0       2.0
2       2.0       2.0
3       1.0       2.0
4       2.0       2.0

[Auto-generated quickchart figures: distributions, 2-d distributions, time series, values]
In [ ]:
# Extract two columns, 'NomCom' and 'Año', from the raw data
bottle_df = bottle[['NomCom','Año']]

# Keep the same column names
bottle_df.columns = ['NomCom','Año']
In [ ]:
bottle_df = bottle_df[:500]      # Keep only the first 500 rows
bottle_df.head()
Out[ ]:
     NomCom   Año
0  Genizaro  2019
1    Panama  2019
2    Panama  2019
3   Jabillo  2019
4    Ojoche  2019

[Auto-generated quickchart figures: categorical distributions, time series]
In [ ]:
from matplotlib import pyplot as plt
import seaborn as sns
# _df_7 is Colab's auto-generated alias for the data being charted (here, presumably the NomCom/Año selection in bottle_df)
_df_7.groupby('NomCom').size().plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
plt.gca().spines[['top', 'right']].set_visible(False)
[Figure: horizontal bar chart of counts by NomCom]
In [ ]:
# @title Año

from matplotlib import pyplot as plt
bottle_df['Año'].plot(kind='line', figsize=(8, 4), title='Año')
plt.gca().spines[['top', 'right']].set_visible(False)
[Figure: line plot of Año]
In [ ]:
# @title Año

from matplotlib import pyplot as plt
bottle_df['Año'].plot(kind='hist', bins=20, title='Año')
plt.gca().spines[['top', 'right',]].set_visible(False)
[Figure: histogram of Año (20 bins)]
In [ ]:
# Extract two columns, 'Sitio' and 'Temporada', from the raw data
bottle_df = bottle[['Sitio','Temporada']]

# Keep the same column names
bottle_df.columns = ['Sitio','Temporada']
In [ ]:
bottle_df = bottle_df[:500]      # Keep only the first 500 rows
bottle_df.head()
Out[ ]:
          Sitio Temporada
0  Peña inculta   Oct-Ene
1  Peña inculta   Oct-Ene
2  Peña inculta   Oct-Ene
3  Peña inculta   Oct-Ene
4  Peña inculta   Oct-Ene
In [ ]:
# @title Temporada

from matplotlib import pyplot as plt
import seaborn as sns
bottle_df.groupby('Temporada').size().plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
plt.gca().spines[['top', 'right',]].set_visible(False)
[Figure: horizontal bar chart of counts by Temporada]

Data

In [ ]:
# Separate the features and the target (eggs -> chicks)
X = df[['NumHuevo']]
y = df['NumPicho']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Separate the features and the target (poaching evidence -> predation)
# Note: this second split overwrites the first, so the model below is trained on Evid_Saque -> Depredado only
X = df[['Evid_Saque']]
y = df['Depredado']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
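As noted above, only the Evid_Saque/Depredado split survives. If both splits are needed, a sketch keeping them under separate names (the _hp and _ed suffixes are illustrative):

# Eggs vs. chicks
X_hp = df[['NumHuevo']]
y_hp = df['NumPicho']
X_hp_train, X_hp_test, y_hp_train, y_hp_test = train_test_split(
    X_hp, y_hp, test_size=0.2, random_state=42)

# Poaching evidence vs. predation
X_ed = df[['Evid_Saque']]
y_ed = df['Depredado']
X_ed_train, X_ed_test, y_ed_train, y_ed_test = train_test_split(
    X_ed, y_ed, test_size=0.2, random_state=42)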
In [ ]:
# Create the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.88      1.00      0.94        97
           2       0.00      0.00      0.00        12
           3       1.00      1.00      1.00         1

    accuracy                           0.88       111
   macro avg       0.47      0.50      0.48       111
weighted avg       0.78      0.88      0.83       111

[[ 0  1  0  0]
 [ 0 97  0  0]
 [ 0 12  0  0]
 [ 0  0  0  1]]
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
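The UndefinedMetricWarning appears because classes 0 and 2 are never predicted; as the warning itself suggests, passing zero_division to classification_report silences it (reusing y_test and y_pred from the cell above):

print(classification_report(y_test, y_pred, zero_division=0))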

Scatter plot

In [ ]:
plt.figure(figsize=(15, 8))
sns.scatterplot(x='NumHuevo', y='NumPicho', hue='Temporada', data=df)
plt.title('Num Huevos vs Num Pichones por Temporada')
plt.xlabel('NumHuevo')
plt.ylabel('NumPicho')
plt.legend()
plt.show()
[Figure: scatter plot of NumHuevo vs NumPicho, colored by Temporada]
In [ ]:
plt.figure(figsize=(15, 8))
sns.scatterplot(x='Evid_Saque', y='Depredado', hue='Temporada', data=df)
plt.title('Evidencia de Saqueo vs Depredados por Temporada')
plt.xlabel('Evid_Saque')
plt.ylabel('Depredado')
plt.legend()
plt.show()
[Figure: scatter plot of Evid_Saque vs Depredado, colored by Temporada]
Scatter plot, version 2
In [ ]:
# Separate the features and the targets
X = df[['NumPicho', 'NumHuevo']]
y_evid_saque = df['Evid_Saque']
y_depredado = df['Depredado']

# Encode the targets in case they are categorical
label_encoder_evid_saque = LabelEncoder()
label_encoder_depredado = LabelEncoder()

y_evid_saque = label_encoder_evid_saque.fit_transform(y_evid_saque)
y_depredado = label_encoder_depredado.fit_transform(y_depredado)

# Split the data into training and test sets
X_train, X_test, y_train_evid_saque, y_test_evid_saque = train_test_split(X, y_evid_saque, test_size=0.2, random_state=42)
X_train, X_test, y_train_depredado, y_test_depredado = train_test_split(X, y_depredado, test_size=0.2, random_state=42)
In [ ]:
# Handle missing values before training the model
X = df[['NumPicho', 'NumHuevo']].fillna({'NumPicho': 0, 'NumHuevo': 0})  # replace NaN with 0 in 'NumPicho' and 'NumHuevo'
Y = df[['Evid_Saque', 'Depredado']]  # targets; these were label-encoded above and have no missing values
In [ ]:
# Handle missing values in X_train and X_test
X_train[['NumPicho', 'NumHuevo']] = X_train[['NumPicho', 'NumHuevo']].fillna(0)  # replace NaN with 0 in the training set
X_test[['NumPicho', 'NumHuevo']] = X_test[['NumPicho', 'NumHuevo']].fillna(0)  # replace NaN with 0 in the test set

# Create the linear regression models
from sklearn.linear_model import LinearRegression
model_evid_saque = LinearRegression()
model_depredado = LinearRegression()

# Train the models
model_evid_saque.fit(X_train, y_train_evid_saque)
model_depredado.fit(X_train, y_train_depredado)

# Make predictions on the test set
y_pred_evid_saque = model_evid_saque.predict(X_test)
y_pred_depredado = model_depredado.predict(X_test)
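To check how much of the variance these two linear fits actually explain on the held-out data, the R² can be computed directly (a minimal sketch, assuming the models and splits from the cells above):

from sklearn.metrics import r2_score

print('R2 Evid_Saque:', r2_score(y_test_evid_saque, y_pred_evid_saque))
print('R2 Depredado :', r2_score(y_test_depredado, y_pred_depredado))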
In [ ]:
plt.figure(figsize=(5, 3))
plt.scatter(X_test['NumHuevo'], y_test_evid_saque, color='blue', label='Datos Reales')
plt.scatter(X_test['NumHuevo'], y_pred_evid_saque, color='orange', label='Predicciones')
plt.title('Regresión Lineal: Evid_Saque vs Num de Huevos')
plt.xlabel('NumHuevo')
plt.ylabel('Evid_Saque')
plt.legend()
plt.show()

plt.figure(figsize=(5, 3))
plt.scatter(X_test['NumPicho'], y_test_evid_saque, color='blue', label='Datos Reales')
plt.scatter(X_test['NumPicho'], y_pred_evid_saque, color='orange', label='Predicciones')
plt.title('Regresión Lineal: Evid_Saque vs Num de Pichones')
plt.xlabel('NumPicho')
plt.ylabel('Evid_Saque')
plt.legend()
plt.show()
[Figures: actual vs predicted Evid_Saque against NumHuevo and against NumPicho]

Correlation plot

In [ ]:
# Select only the numeric columns for the correlation calculation
numeric_df = df.select_dtypes(include=['number'])

plt.figure(figsize=(10, 6))
correlation_matrix = numeric_df.corr()  # correlation over the numeric columns
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Mapa de Calor de Correlaciones')
plt.show()
[Figure: correlation heatmap of all numeric columns]
In [ ]:
# Select the columns of interest
columnas_especificas = ['NumHuevo', 'NumPicho', 'Evid_Saque', 'Depredado']
df_especifico = df[columnas_especificas]

# Compute the correlation matrix
correlation_matrix = df_especifico.corr()

# Create the heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Mapa de Calor de Correlaciones (Columnas Específicas)')
plt.show()
[Figure: correlation heatmap of NumHuevo, NumPicho, Evid_Saque, Depredado]

Pie chart

In [ ]:
# Count the occurrences of each category in Evid_Saque
evid_saque_counts = df['Evid_Saque'].value_counts()

# Create the pie chart with custom colors
colors = ['Skyblue', 'Pink']
plt.figure(figsize=(8, 8))
plt.pie(evid_saque_counts, labels=evid_saque_counts.index, autopct='%1.1f%%', startangle=140, colors=colors)
plt.title('Distribución de Evidencia de Saque')
plt.show()
[Figure: pie chart of Evid_Saque categories]
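Because Evid_Saque was label-encoded earlier, the slices above are labelled with integer codes. A sketch that maps them back to the original categories using the encoder fitted at the top of the notebook:

labels = label_encoder_Evid_Saque.inverse_transform(evid_saque_counts.index)
plt.figure(figsize=(8, 8))
plt.pie(evid_saque_counts, labels=labels, autopct='%1.1f%%', startangle=140, colors=colors)
plt.title('Distribución de Evidencia de Saque')
plt.show()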
In [ ]:
# Count the occurrences of each category in Depredado
depredado_counts = df['Depredado'].value_counts()

# Create the pie chart
colors = ['Skyblue', 'Purple']
plt.figure(figsize=(8, 8))
plt.pie(depredado_counts, labels=depredado_counts.index, autopct='%1.1f%%', startangle=140, colors=colors)
plt.title('Distribución de Depredado')
plt.show()
[Figure: pie chart of Depredado categories]

Linear regression model

In [ ]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
import operator

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

# Try to detect the delimiter automatically
df = pd.read_csv('/content/drive/MyDrive/DATOS_LNA.csv', sep=None, engine='python')

# If you know the delimiter, specify it directly
# df = pd.read_csv('/content/drive/MyDrive/DATOS_LNA.csv', sep=',')  # if it is a comma
# df = pd.read_csv('/content/drive/MyDrive/DATOS_LNA.csv', sep=';')  # if it is a semicolon

df.shape
bottle.head()
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Out[ ]:
ID CodArbol NomCien NomCom EspePsitac Sitio Año Temporada MesEscal Escalado ... ProfEntCavid DAPArbol Clima Nubosidad Nubosid% Soleado Solead% Precip Viento ActivoAno
0 1 PSSG1 Samanea saman Genizaro LNA Peña inculta 2019 Oct-Ene Nov Si ... 74 324 Soleado viento NaN NaN Soleado NaN NaN Poco No
1 2 PSAP2 Sterculia apetala Panama LNA Peña inculta 2019 Oct-Ene Dic Si ... 85 13.2 50%nublado mucho viento Nublado 50.0 NaN NaN NaN Fuerte Si
2 4 PSAP3 Sterculia apetala Panama LNA Peña inculta 2019 Oct-Ene Nov Si ... 80 11.2 30%nublado mucho viento Nublado 30.0 NaN NaN NaN Fuerte Si
3 5 PHCJ4 Hura crepitans Jabillo LNA Peña inculta 2019 Oct-Ene Dic Si ... 70 4.6 20% nublado sin viento Nublado 20.0 NaN NaN NaN NaN Si
4 6 PBAO5 Brosimum alicastrum Ojoche LNA Peña inculta 2019 Oct-Ene Dic Si ... 29 3.7 Soleado NaN NaN Soleado NaN NaN NaN Si

5 rows × 52 columns

In [ ]:
bottle.describe()
Out[ ]:
ID Año ConCavi_M Evid_Activ PresePadr NumHuevo NumPicho PichoMuerto PichoVuelEx PichoRehab Edad1Dia Edad2Dia Edad3Dia Edad4Dia AltCavidad DirGrado AltEntCavid AncEntCavid Nubosid% Solead%
count 553.000000 553.000000 533.000000 15.000000 472.000000 117.000000 258.000000 14.000000 202.000000 7.000000 183.000000 121.000000 49.000000 1.0 494.000000 382.000000 459.000000 473.000000 224.000000 85.000000
mean 300.781193 2021.094033 0.120075 0.733333 0.792373 1.504274 1.906977 1.357143 1.891089 0.857143 29.628415 27.884298 24.285714 60.0 14.214730 178.378796 29.188017 14.375264 55.553571 85.470588
std 173.578703 1.902092 0.325355 0.457738 0.406039 0.961536 0.803125 0.841897 0.796772 0.899735 15.321954 15.259527 13.604534 NaN 5.919842 106.788897 28.088252 9.143355 31.790186 25.208931
min 1.000000 2017.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.000000 0.000000 2.000000 1.000000 0.000000 60.0 0.451389 0.000000 1.300000 4.000000 5.000000 20.000000
25% 150.000000 2020.000000 0.000000 0.500000 1.000000 1.000000 1.000000 1.000000 1.000000 0.000000 18.000000 15.000000 16.000000 60.0 10.600000 94.000000 14.000000 9.000000 20.000000 90.000000
50% 300.000000 2021.000000 0.000000 1.000000 1.000000 1.000000 2.000000 1.000000 2.000000 1.000000 30.000000 28.000000 21.000000 60.0 13.900000 180.000000 20.000000 12.000000 50.000000 100.000000
75% 453.000000 2023.000000 0.000000 1.000000 1.000000 2.000000 3.000000 1.750000 2.000000 1.500000 42.000000 40.000000 34.000000 60.0 17.285000 270.000000 33.000000 17.000000 90.000000 100.000000
max 595.000000 2024.000000 1.000000 1.000000 1.000000 3.000000 4.000000 3.000000 4.000000 2.000000 62.000000 60.000000 60.000000 60.0 100.000000 713.000000 240.000000 130.000000 100.000000 100.000000
In [ ]:
# Extract two columns, 'NumHuevo' and 'NumPicho', from the raw data
bottle_df = bottle[['NumHuevo','NumPicho']]

# Keep the same column names
bottle_df.columns = ['NumHuevo', 'NumPicho']
In [ ]:
bottle_df = bottle_df[:500]      # Keep only the first 500 rows
bottle_df.head()
Out[ ]:
   NumHuevo  NumPicho
0       NaN       1.0
1       NaN       2.0
2       2.0       NaN
3       1.0       NaN
4       2.0       2.0
In [ ]:
# Visualize the relationship with a pairplot (scatter plots with regression fits)

# sns.lmplot(x="NumHuevo", y="NumPicho", data=bottle_df, order=2, ci=None);
sns.pairplot(bottle_df, kind="reg")
Out[ ]:
<seaborn.axisgrid.PairGrid at 0x7b1021d40d00>
[Figure: pairplot of NumHuevo and NumPicho with regression fits]
In [ ]:
# see how many null values we have

bottle_df.isnull().sum()
Out[ ]:
NumHuevo    391
NumPicho    266

In [ ]:
# Forward-fill missing values (an alternative is to drop incomplete rows; see the sketch below)

bottle_df.fillna(method='ffill', inplace=True)
#bottle_df.isnull().sum()
<ipython-input-196-5497b30dd99e>:3: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
  bottle_df.fillna(method='ffill', inplace=True)
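If dropping the incomplete rows is preferred over forward-filling them, a minimal alternative (this keeps fewer rows, since NumHuevo alone has 391 missing values):

bottle_df = bottle_df.dropna(subset=['NumHuevo', 'NumPicho'])
print(bottle_df.isnull().sum())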
In [ ]:
# Chosen features

X = np.array(bottle_df['NumHuevo']).reshape(-1, 1)
y = np.array(bottle_df['NumPicho']).reshape(-1, 1)
In [ ]:
# Split the data: 20% test, 80% train

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)
Linear regression plot
In [ ]:
# Forward-fill missing values
bottle_df.fillna(method='ffill', inplace=True)

# Chosen features
X = np.array(bottle_df['NumHuevo']).reshape(-1, 1)
y = np.array(bottle_df['NumPicho']).reshape(-1, 1)

# Split the data: 20% test, 80% train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

# Check for NaNs in the training and testing sets and replace them with 0
X_train = np.nan_to_num(X_train, nan=0)
y_train = np.nan_to_num(y_train, nan=0)
X_test = np.nan_to_num(X_test, nan=0)
y_test = np.nan_to_num(y_test, nan=0)

# Fit the model
from sklearn.linear_model import LinearRegression
lin_df = LinearRegression()
lin_df.fit(X_train, y_train)
<ipython-input-207-52cba31ba30a>:2: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
  bottle_df.fillna(method='ffill', inplace=True)
Out[ ]:
LinearRegression()
In [ ]:
y_pred = lin_df.predict(X_test)                                     # Predict with the linear model
accuracy_score = lin_df.score(X_test, y_test)                       # score() returns R^2, not classification accuracy
print("Linear Regression Model Accuracy Score: " + "{:.1%}".format(accuracy_score))
Linear Regression Model Accuracy Score: -0.5%
In [ ]:
from sklearn.metrics import mean_squared_error,r2_score

print("R2 Score: " +"{:.3}".format(r2_score(y_test, y_pred)));
R2 Score: -0.00496
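For a like-for-like comparison with the polynomial model in the next section, the test-set RMSE of this linear fit can be computed from the same predictions (a minimal sketch):

from sklearn.metrics import mean_squared_error
rmse_lin = np.sqrt(mean_squared_error(y_test, y_pred))
print("Linear Regression Test RMSE: " + "{:.2}".format(rmse_lin))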
In [ ]:
# Plot the model

plt.scatter(X_test, y_test, color='Skyblue')
plt.plot(X_test, y_pred, color='purple')
plt.show()
[Figure: test-set scatter with the fitted regression line]

Polynomial regression

In [ ]:
from sklearn.preprocessing import PolynomialFeatures

poly_df = PolynomialFeatures(degree = 4)
transform_poly = poly_df.fit_transform(X_train)

linreg2 = LinearRegression()
linreg2.fit(transform_poly,y_train)

polynomial_predict = linreg2.predict(transform_poly)
In [ ]:
# Note: polynomial_predict was computed on X_train, so these metrics describe the training set
rmse = np.sqrt(mean_squared_error(y_train,polynomial_predict))
r2 = r2_score(y_train,polynomial_predict)
print("RMSE Score for Test set: " +"{:.2}".format(rmse))
print("R2 Score for Test set: " +"{:.2}".format(r2))
RMSE Score for Test set: 0.81
R2 Score for Test set: 0.024
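As noted in the comment above, these numbers describe the training fit. A sketch of the same metrics on the held-out data, reusing poly_df and linreg2 from the previous cell:

transform_poly_test = poly_df.transform(X_test)
polynomial_predict_test = linreg2.predict(transform_poly_test)
print("RMSE (test): " + "{:.2}".format(np.sqrt(mean_squared_error(y_test, polynomial_predict_test))))
print("R2 (test):   " + "{:.2}".format(r2_score(y_test, polynomial_predict_test)))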
In [ ]:
plt.scatter(X_train, y_train, s=50)
# sort the values of x before line plot
sort_axis = operator.itemgetter(0)
sorted_zip = sorted(zip(X_train,polynomial_predict), key=sort_axis)
X_train, polynomial_predict = zip(*sorted_zip)
plt.plot(X_train, polynomial_predict, color='m')
plt.show()
[Figure: training data with the polynomial fit]