library(reticulate)
Importando los paquetes.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
Importando la data.
raw_csv_data = pd.read_csv("E:/Index2018.csv")
df_comp=raw_csv_data.copy()
df_comp.describe()
## spx dax ftse nikkei
## count 6269.000000 6269.000000 6269.000000 6269.000000
## mean 1288.127542 6080.063363 5422.713545 14597.055700
## std 487.586473 2754.361032 1145.572428 4043.122953
## min 438.920000 1911.700000 2876.600000 7054.980000
## 25% 990.671905 4069.350000 4486.100000 10709.290000
## 50% 1233.420000 5773.340000 5662.430000 15028.170000
## 75% 1459.987747 7443.070000 6304.250000 17860.470000
## max 2872.867839 13559.600000 7778.637689 24124.150000
df_comp.head()
## date spx dax ftse nikkei
## 0 07/01/1994 469.90 2224.95 3445.98 18124.01
## 1 10/01/1994 475.27 2225.00 3440.58 18443.44
## 2 11/01/1994 474.13 2228.10 3413.77 18485.25
## 3 12/01/1994 474.17 2182.06 3372.02 18793.88
## 4 13/01/1994 472.47 2142.37 3360.01 18577.26
df_comp.date.describe()
## count 6269
## unique 6269
## top 03/01/2000
## freq 1
## Name: date, dtype: object
Convertir el texto a fechas o serie de tiempo.
df_comp.date = pd.to_datetime(df_comp.date, dayfirst = True)
df_comp.head()
## date spx dax ftse nikkei
## 0 1994-01-07 469.90 2224.95 3445.98 18124.01
## 1 1994-01-10 475.27 2225.00 3440.58 18443.44
## 2 1994-01-11 474.13 2228.10 3413.77 18485.25
## 3 1994-01-12 474.17 2182.06 3372.02 18793.88
## 4 1994-01-13 472.47 2142.37 3360.01 18577.26
df_comp.date.describe()
## count 6269
## unique 6269
## top 1999-11-12 00:00:00
## freq 1
## first 1994-01-07 00:00:00
## last 2018-01-29 00:00:00
## Name: date, dtype: object
##
## <string>:1: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.
Poniendo la fecha como índice.
df_comp.set_index("date", inplace=True)
df_comp.head()
## spx dax ftse nikkei
## date
## 1994-01-07 469.90 2224.95 3445.98 18124.01
## 1994-01-10 475.27 2225.00 3440.58 18443.44
## 1994-01-11 474.13 2228.10 3413.77 18485.25
## 1994-01-12 474.17 2182.06 3372.02 18793.88
## 1994-01-13 472.47 2142.37 3360.01 18577.26
Ubicando la frecuencia deseada.
df_comp=df_comp.asfreq('d')
df_comp.head(10)
## spx dax ftse nikkei
## date
## 1994-01-07 469.90 2224.95 3445.98 18124.01
## 1994-01-08 NaN NaN NaN NaN
## 1994-01-09 NaN NaN NaN NaN
## 1994-01-10 475.27 2225.00 3440.58 18443.44
## 1994-01-11 474.13 2228.10 3413.77 18485.25
## 1994-01-12 474.17 2182.06 3372.02 18793.88
## 1994-01-13 472.47 2142.37 3360.01 18577.26
## 1994-01-14 474.91 2151.05 3400.56 18973.70
## 1994-01-15 NaN NaN NaN NaN
## 1994-01-16 NaN NaN NaN NaN
df_comp=df_comp.asfreq('b')
df_comp.head(10)
## spx dax ftse nikkei
## date
## 1994-01-07 469.90 2224.95 3445.98 18124.01
## 1994-01-10 475.27 2225.00 3440.58 18443.44
## 1994-01-11 474.13 2228.10 3413.77 18485.25
## 1994-01-12 474.17 2182.06 3372.02 18793.88
## 1994-01-13 472.47 2142.37 3360.01 18577.26
## 1994-01-14 474.91 2151.05 3400.56 18973.70
## 1994-01-17 473.30 2115.56 3407.83 18725.37
## 1994-01-18 474.25 2130.35 3437.01 18514.55
## 1994-01-19 474.30 2132.52 3475.15 19039.40
## 1994-01-20 474.98 2098.36 3469.99 19183.92
Manejo de valores perdidos.
df_comp.isna().sum()
## spx 8
## dax 8
## ftse 8
## nikkei 8
## dtype: int64
df_comp
## spx dax ftse nikkei
## date
## 1994-01-07 469.900000 2224.95 3445.980000 18124.01
## 1994-01-10 475.270000 2225.00 3440.580000 18443.44
## 1994-01-11 474.130000 2228.10 3413.770000 18485.25
## 1994-01-12 474.170000 2182.06 3372.020000 18793.88
## 1994-01-13 472.470000 2142.37 3360.010000 18577.26
## ... ... ... ... ...
## 2018-01-23 2839.130362 13559.60 7731.827774 24124.15
## 2018-01-24 2837.544008 13414.74 7643.428966 23940.78
## 2018-01-25 2839.253031 13298.36 7615.839954 23669.49
## 2018-01-26 2872.867839 13340.17 7665.541292 23631.88
## 2018-01-29 2853.528411 13324.48 7671.533300 23629.34
##
## [6277 rows x 4 columns]
df_comp.spx=df_comp.spx.fillna(method='ffill')
df_comp.isna().sum()
## spx 0
## dax 8
## ftse 8
## nikkei 8
## dtype: int64
df_comp.ftse=df_comp.ftse.fillna(method='bfill')
df_comp.dax=df_comp.dax.fillna(value = df_comp.dax.mean())
df_comp.nikkei=df_comp.dax.fillna(method='bfill')
df_comp.isna().sum()
## spx 0
## dax 0
## ftse 0
## nikkei 0
## dtype: int64
Simplificando el dataset.
df_comp['market_value']=df_comp.spx
df_comp.describe()
## spx dax ftse nikkei market_value
## count 6277.000000 6277.000000 6277.000000 6277.000000 6277.000000
## mean 1288.642547 6080.063363 5423.690398 6080.063363 1288.642547
## std 487.868210 2752.604984 1145.568370 2752.604984 487.868210
## min 438.920000 1911.700000 2876.600000 1911.700000 438.920000
## 25% 992.715221 4070.460000 4487.880000 4070.460000 992.715221
## 50% 1233.761241 5774.380000 5663.300000 5774.380000 1233.761241
## 75% 1460.250000 7442.660000 6304.630175 7442.660000 1460.250000
## max 2872.867839 13559.600000 7778.637689 13559.600000 2872.867839
del df_comp['spx']
df_comp.head()
## dax ftse nikkei market_value
## date
## 1994-01-07 2224.95 3445.98 2224.95 469.90
## 1994-01-10 2225.00 3440.58 2225.00 475.27
## 1994-01-11 2228.10 3413.77 2228.10 474.13
## 1994-01-12 2182.06 3372.02 2182.06 474.17
## 1994-01-13 2142.37 3360.01 2142.37 472.47
del df_comp['dax']
del df_comp['ftse']
del df_comp['nikkei']
df_comp.describe()
## market_value
## count 6277.000000
## mean 1288.642547
## std 487.868210
## min 438.920000
## 25% 992.715221
## 50% 1233.761241
## 75% 1460.250000
## max 2872.867839
Dividiendo la data en conjunto de entrenamiento y de prueba.
size = int(len(df_comp)*0.8)
df = df_comp.iloc[:size]
df_test = df_comp.iloc[size:]
df.tail()
## market_value
## date
## 2013-04-01 1562.173837
## 2013-04-02 1570.252238
## 2013-04-03 1553.686978
## 2013-04-04 1559.979316
## 2013-04-05 1553.278930
df_test.head()
## market_value
## date
## 2013-04-08 1563.071269
## 2013-04-09 1568.607909
## 2013-04-10 1587.731827
## 2013-04-11 1593.369863
## 2013-04-12 1588.854623