Carregando pacotes



if(!require("pacman")) install.packages("pacman")

pacman::p_load(tidyverse,
               lubridate,
               ggpubr,
               ggstatsplot,
               naniar,
               simputation,
               caret,
               janitor,
               knitr,
               h2o)

knitr::opts_chunk$set(echo = TRUE)

Carregar o banco de dados


datatrain <- read.csv("dados_treinamento.csv")
datatest <- read.csv("dados_teste.csv")

Estrutura dos dados

datatrain %>% glimpse
Rows: 16,440
Columns: 57
$ TEST_SET_NAME                  <chr> "NCB_BIA005", "NCB_BIA001", "NCB_BIA002", "NCB_BIA005", "NCB_BIA005",~
$ EXPERIMENT_STAGE_NAME          <chr> "PCM2", "PCM2", "PCM2", "PCM2", "PCM2", "PCM2", "PCM2", "PCM2", "PCM2~
$ GROWING_PROGRAM_REF_ID         <chr> "TD", "TD", "TD", "TD", "TD", "TD", "TD", "TD", "TD", "TD", "TD", "TD~
$ PIPELINE_NAME                  <chr> "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR", "BR~
$ TEST_SET_SEASON                <chr> "WET", "WET", "WET", "WET", "WET", "WET", "WET", "WET", "WET", "WET",~
$ ME                             <chr> "ME1", "ME1", "ME1", "ME1", "ME1", "ME1", "ME1", "ME1", "ME1", "ME1",~
$ COUNTRY_NAME                   <chr> "India", "India", "India", "India", "India", "India", "India", "India~
$ REP                            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
$ SUB_COUNTRY_NAME               <chr> "Maharashtra", "Maharashtra", "Maharashtra", "Maharashtra", "Maharash~
$ LOCATION_NAME                  <chr> "Kalwan, MH, IND", "Kalwan, MH, IND", "Kalwan, MH, IND", "Kalwan, MH,~
$ FIELD_NAME...sub.site          <chr> "KWN2", "KWN2", "KWN2", "KWN2", "KWN2", "KWN2", "KWN2", "KWN2", "KWN2~
$ IS_IRRIGATED                   <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,~
$ PLANTING_DATE                  <chr> "6/27/2017 0:00", "6/27/2017 0:00", "6/27/2017 0:00", "6/27/2017 0:00~
$ PLANTING_MONTH                 <chr> "JUN", "JUN", "JUN", "JUN", "JUN", "JUN", "JUN", "JUN", "JUN", "JUN",~
$ HARVEST_DATE                   <chr> "11/16/2017 0:00", "11/16/2017 0:00", "11/16/2017 0:00", "11/16/2017 ~
$ PRODUCT_NAME                   <chr> "P3401", "DKC9141", "NK6240", "DKC9141", "DKC9133", "P3401", "DKC9207~
$ HARVEST_PLOT_LENGTH            <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ~
$ HARVEST_PLOT_WIDTH             <dbl> 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2,~
$ HARVEST.AREA                   <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ~
$ max_temperature                <dbl> 82.65315, 82.65315, 82.65315, 82.65315, 82.65315, 82.65315, 82.65315,~
$ min_temperature                <dbl> 71.37832, 71.37832, 71.37832, 71.37832, 71.37832, 71.37832, 71.37832,~
$ max_dew_point_temperature      <dbl> 72.30490, 72.30490, 72.30490, 72.30490, 72.30490, 72.30490, 72.30490,~
$ min_dew_point_temperature      <dbl> 68.41189, 68.41189, 68.41189, 68.41189, 68.41189, 68.41189, 68.41189,~
$ avg_dew_point_temperature      <dbl> 70.39301, 70.39301, 70.39301, 70.39301, 70.39301, 70.39301, 70.39301,~
$ total_precipitation            <dbl> 40.0, 40.0, 40.0, 40.0, 40.0, 40.0, 40.0, 40.0, 40.0, 40.0, 40.0, 40.~
$ max_wind_speed                 <dbl> 10.76853, 10.76853, 10.76853, 10.76853, 10.76853, 10.76853, 10.76853,~
$ min_wind_speed                 <dbl> 5.840559, 5.840559, 5.840559, 5.840559, 5.840559, 5.840559, 5.840559,~
$ avg_wind_speed                 <dbl> 8.132168, 8.132168, 8.132168, 8.132168, 8.132168, 8.132168, 8.132168,~
$ avg_wind_direction             <dbl> 232.0839, 232.0839, 232.0839, 232.0839, 232.0839, 232.0839, 232.0839,~
$ max_wind_gust                  <dbl> 24.81399, 24.81399, 24.81399, 24.81399, 24.81399, 24.81399, 24.81399,~
$ max_relative_humidity          <dbl> 94.79161, 94.79161, 94.79161, 94.79161, 94.79161, 94.79161, 94.79161,~
$ min_relative_humidity          <dbl> 67.26643, 67.26643, 67.26643, 67.26643, 67.26643, 67.26643, 67.26643,~
$ avg_relative_humidity          <dbl> 83.84895, 83.84895, 83.84895, 83.84895, 83.84895, 83.84895, 83.84895,~
$ total_downward_solar_radiation <dbl> 2175.196, 2175.196, 2175.196, 2175.196, 2175.196, 2175.196, 2175.196,~
$ max_downward_solar_radiation   <dbl> 334.229, 334.229, 334.229, 334.229, 334.229, 334.229, 334.229, 334.22~
$ total_net_solar_radiation      <dbl> 1825.682, 1825.682, 1825.682, 1825.682, 1825.682, 1825.682, 1825.682,~
$ min_atmospheric_pressure       <dbl> 915.9, 915.9, 915.9, 915.9, 915.9, 915.9, 915.9, 915.9, 915.9, 915.9,~
$ avg_total_cloud_cover          <dbl> 115.41, 115.41, 115.41, 115.41, 115.41, 115.41, 115.41, 115.41, 115.4~
$ avg_snow_depth                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ avg_snow_density               <dbl> 234.377, 234.377, 234.377, 234.377, 234.377, 234.377, 234.377, 234.37~
$ max_soil_temperature_level_1   <dbl> 81.92098, 81.92098, 81.92098, 81.92098, 81.92098, 81.92098, 81.92098,~
$ max_soil_temperature_level_2   <dbl> 77.71469, 77.71469, 77.71469, 77.71469, 77.71469, 77.71469, 77.71469,~
$ max_soil_temperature_level_3   <dbl> 77.13916, 77.13916, 77.13916, 77.13916, 77.13916, 77.13916, 77.13916,~
$ max_soil_temperature_level_4   <dbl> 78.12448, 78.12448, 78.12448, 78.12448, 78.12448, 78.12448, 78.12448,~
$ min_soil_temperature_level_1   <dbl> 73.46294, 73.46294, 73.46294, 73.46294, 73.46294, 73.46294, 73.46294,~
$ min_soil_temperature_level_2   <dbl> 75.03007, 75.03007, 75.03007, 75.03007, 75.03007, 75.03007, 75.03007,~
$ min_soil_temperature_level_3   <dbl> 76.98951, 76.98951, 76.98951, 76.98951, 76.98951, 76.98951, 76.98951,~
$ min_soil_temperature_level_4   <dbl> 78.09580, 78.09580, 78.09580, 78.09580, 78.09580, 78.09580, 78.09580,~
$ avg_soil_temperature_level_1   <dbl> 76.96503, 76.96503, 76.96503, 76.96503, 76.96503, 76.96503, 76.96503,~
$ avg_soil_temperature_level_2   <dbl> 76.34965, 76.34965, 76.34965, 76.34965, 76.34965, 76.34965, 76.34965,~
$ avg_soil_temperature_level_3   <dbl> 77.04685, 77.04685, 77.04685, 77.04685, 77.04685, 77.04685, 77.04685,~
$ avg_soil_temperature_level_4   <dbl> 78.11329, 78.11329, 78.11329, 78.11329, 78.11329, 78.11329, 78.11329,~
$ avg_soil_moisture_level_1      <dbl> 53.87, 53.87, 53.87, 53.87, 53.87, 53.87, 53.87, 53.87, 53.87, 53.87,~
$ avg_soil_moisture_level_2      <dbl> 53.23, 53.23, 53.23, 53.23, 53.23, 53.23, 53.23, 53.23, 53.23, 53.23,~
$ avg_soil_moisture_level_3      <dbl> 49.96, 49.96, 49.96, 49.96, 49.96, 49.96, 49.96, 49.96, 49.96, 49.96,~
$ avg_soil_moisture_level_4      <dbl> 50.36, 50.36, 50.36, 50.36, 50.36, 50.36, 50.36, 50.36, 50.36, 50.36,~
$ Prod                           <dbl> 142, 145, 122, 158, 118, 128, 140, 139, 97, 119, 111, 122, 138, 131, ~
datatest %>% glimpse
Rows: 2,958
Columns: 57
$ TEST_SET_NAME                  <chr> "4", "4", "9", "9", "9", "9", "1", "18", "4", "3", "4", "3", "3", "3"~
$ EXPERIMENT_STAGE_NAME          <chr> "MD", "MD", "MD", "MD", "MD", "MD", "MD", "MD", "MD", "MD", "MD", "MD~
$ GROWING_PROGRAM_REF_ID         <chr> "AT5", "AT5", "DP1", "DP1", "DP1", "DP1", "FO9", "AT5", "AT5", "AR7",~
$ PIPELINE_NAME                  <chr> "MD", "MD", "MD", "MD", "MD", "MD", "MD", "MD", "MD", "MD", "MD", "MD~
$ TEST_SET_SEASON                <chr> "WET", "WET", "WET", "WET", "WET", "WET", "WET", "WET", "WET", "WET",~
$ ME                             <chr> "ME1", "ME1", "ME1", "ME1", "ME1", "ME1", "ME1", "ME1", "ME1", "ME1",~
$ COUNTRY_NAME                   <chr> "India", "India", "India", "India", "India", "India", "India", "India~
$ REP                            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
$ SUB_COUNTRY_NAME               <chr> "Karnataka", "Karnataka", "Maharashtra", "Maharashtra", "Maharashtra"~
$ LOCATION_NAME                  <chr> "Chitradurga, KA, IND", "Chitradurga, KA, IND", "Nandgaon, MH, IND", ~
$ FIELD_NAME...sub.site          <chr> "DYPL", "DYPL", "L9", "L9", "L9", "L9", "KALA", "SDPR", "DYPL", "23",~
$ IS_IRRIGATED                   <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,~
$ PLANTING_DATE                  <chr> "06/07/2017 00:00", "06/07/2017 00:00", "07/03/2019 00:00", "07/03/20~
$ PLANTING_MONTH                 <chr> "JUN", "JUN", "JUL", "JUL", "JUL", "JUL", "JUN", "MAY", "JUN", "JUN",~
$ HARVEST_DATE                   <chr> "11/19/2017 0:00", "11/19/2017 0:00", "12/15/2019 0:00", "12/15/2019 ~
$ PRODUCT_NAME                   <chr> "DKC9141", "P3550", "P3501", "NK6240", "PAC751", "P3401", "P3401", "S~
$ HARVEST_PLOT_LENGTH            <dbl> 25.0, 25.0, 42.0, 42.0, 42.0, 42.0, 60.0, 25.0, 25.0, 8.1, 25.0, 8.1,~
$ HARVEST_PLOT_WIDTH             <dbl> 20.0, 20.0, 12.0, 12.0, 12.0, 12.0, 8.1, 20.0, 20.0, 60.0, 20.0, 60.0~
$ HARVEST.AREA                   <dbl> 500.0, 500.0, 504.0, 504.0, 504.0, 504.0, 486.0, 500.0, 500.0, 486.0,~
$ max_temperature                <dbl> 83.83989, 83.83989, 84.58757, 84.58757, 84.58757, 84.58757, 84.51326,~
$ min_temperature                <dbl> 70.88415, 70.88415, 71.44438, 71.44438, 71.44438, 71.44438, 71.45691,~
$ max_dew_point_temperature      <dbl> 71.11148, 71.11148, 71.79172, 71.79172, 71.79172, 71.79172, 71.64696,~
$ min_dew_point_temperature      <dbl> 66.65519, 66.65519, 67.81834, 67.81834, 67.81834, 67.81834, 66.95193,~
$ avg_dew_point_temperature      <dbl> 68.82896, 68.82896, 69.72012, 69.72012, 69.72012, 69.72012, 69.21768,~
$ total_precipitation            <dbl> 32.7, 32.7, 31.0, 31.0, 31.0, 31.0, 59.4, 37.8, 32.7, 22.9, 32.7, 22.~
$ max_wind_speed                 <dbl> 12.281967, 12.281967, 9.434320, 9.434320, 9.434320, 9.434320, 7.97458~
$ min_wind_speed                 <dbl> 6.489617, 6.489617, 3.905325, 3.905325, 3.905325, 3.905325, 3.171823,~
$ avg_wind_speed                 <dbl> 9.204918, 9.204918, 6.414201, 6.414201, 6.414201, 6.414201, 5.489503,~
$ avg_wind_direction             <dbl> 223.04918, 223.04918, 189.89941, 189.89941, 189.89941, 189.89941, 191~
$ max_wind_gust                  <dbl> 24.78197, 24.78197, 21.15207, 21.15207, 21.15207, 21.15207, 17.79503,~
$ max_relative_humidity          <dbl> 91.94645, 91.94645, 91.26095, 91.26095, 91.26095, 91.26095, 89.62873,~
$ min_relative_humidity          <dbl> 60.32678, 60.32678, 61.69704, 61.69704, 61.69704, 61.69704, 63.12928,~
$ avg_relative_humidity          <dbl> 78.86557, 78.86557, 79.04438, 79.04438, 79.04438, 79.04438, 78.21215,~
$ total_downward_solar_radiation <dbl> 2912.047, 2912.047, 2385.952, 2385.952, 2385.952, 2385.952, 2508.140,~
$ max_downward_solar_radiation   <dbl> 453.655, 453.655, 377.006, 377.006, 377.006, 377.006, 395.395, 381.74~
$ total_net_solar_radiation      <dbl> 2338.863, 2338.863, 2014.779, 2014.779, 2014.779, 2014.779, 2111.964,~
$ min_atmospheric_pressure       <dbl> 926.5, 926.5, 936.5, 936.5, 936.5, 936.5, 929.7, 902.5, 926.5, 944.1,~
$ avg_total_cloud_cover          <dbl> 156.12, 156.12, 121.00, 121.00, 121.00, 121.00, 122.13, 147.87, 156.1~
$ avg_snow_depth                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ avg_snow_density               <dbl> 299.937, 299.937, 276.991, 276.991, 276.991, 276.991, 296.659, 262.24~
$ max_soil_temperature_level_1   <dbl> 83.41038, 83.41038, 82.63314, 82.63314, 82.63314, 82.63314, 83.64586,~
$ max_soil_temperature_level_2   <dbl> 79.14863, 79.14863, 78.86331, 78.86331, 78.86331, 78.86331, 79.56630,~
$ max_soil_temperature_level_3   <dbl> 79.09891, 79.09891, 79.43432, 79.43432, 79.43432, 79.43432, 80.51492,~
$ max_soil_temperature_level_4   <dbl> 81.19071, 81.19071, 82.39053, 82.39053, 82.39053, 82.39053, 83.41768,~
$ min_soil_temperature_level_1   <dbl> 74.74372, 74.74372, 74.45976, 74.45976, 74.45976, 74.45976, 73.93978,~
$ min_soil_temperature_level_2   <dbl> 76.72240, 76.72240, 76.69645, 76.69645, 76.69645, 76.69645, 77.22376,~
$ min_soil_temperature_level_3   <dbl> 78.95246, 78.95246, 79.30178, 79.30178, 79.30178, 79.30178, 80.33425,~
$ min_soil_temperature_level_4   <dbl> 81.14809, 81.14809, 82.33314, 82.33314, 82.33314, 82.33314, 83.35193,~
$ avg_soil_temperature_level_1   <dbl> 78.47049, 78.47049, 78.16391, 78.16391, 78.16391, 78.16391, 78.27293,~
$ avg_soil_temperature_level_2   <dbl> 77.93497, 77.93497, 77.78462, 77.78462, 77.78462, 77.78462, 78.39282,~
$ avg_soil_temperature_level_3   <dbl> 79.00984, 79.00984, 79.35680, 79.35680, 79.35680, 79.35680, 80.41215,~
$ avg_soil_temperature_level_4   <dbl> 81.16885, 81.16885, 82.35858, 82.35858, 82.35858, 82.35858, 83.38287,~
$ avg_soil_moisture_level_1      <dbl> 58.56, 58.56, 71.02, 71.02, 71.02, 71.02, 77.88, 71.33, 58.56, 41.51,~
$ avg_soil_moisture_level_2      <dbl> 56.38, 56.38, 71.97, 71.97, 71.97, 71.97, 78.09, 69.89, 56.38, 40.62,~
$ avg_soil_moisture_level_3      <dbl> 46.23, 46.23, 69.39, 69.39, 69.39, 69.39, 77.95, 64.47, 46.23, 37.75,~
$ avg_soil_moisture_level_4      <dbl> 43.17, 43.17, 53.27, 53.27, 53.27, 53.27, 84.69, 60.59, 43.17, 41.51,~
$ Prod                           <dbl> 51.61856, 50.48400, 66.06287, 65.96983, 60.39869, 55.36528, 45.14597,~

Primeiras linhas do datatrain


datatrain %>% head
NA

Primeiras linhas do datatest

datatest %>% head

Padronizar coluna de datas do arquivo treino

Padronizar coluna de datas do arquivo teste


datatest <- datatest %>% mutate(HARVEST_DATE = substr(HARVEST_DATE, start = 1, stop = 10),
                                  HARVEST_DATE = parse_date_time(HARVEST_DATE, orders="mdy"))


datatest <- datatest %>% mutate(PLANTING_DATE = substr(PLANTING_DATE, start = 1, stop = 10),
                                  PLANTING_DATE = parse_date_time(PLANTING_DATE, orders="mdy"))

Criando a variavel DAP (Dias após plantio)


datatrain %>% ggboxplot(y="DAP", xlab = " ",title = "Treino", fill = "red", bxp.errorbar = T)+coord_flip()


summary(datatrain$DAP)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   61.0   122.0   132.0   132.2   141.0   218.0 

datatest %>% ggboxplot(y="DAP", xlab=" ",ylab = "DAP",title = "Teste", fill = "red", bxp.errorbar = T)+coord_flip()


summary(datatest$DAP)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   98.0   118.0   132.0   133.9   146.0   198.0 

Transformar variaveis que são caracteres em fatores


datatrain <- datatrain  %>% 
             mutate_if(is.character, as.factor) %>% 
             as_tibble() 


datatest <- datatest  %>% 
            mutate_if(is.character, as.factor) %>% 
            as_tibble()


##############################################################################




datatrain %>% glimpse()
Rows: 16,440
Columns: 58
$ TEST_SET_NAME                  <fct> NCB_BIA005, NCB_BIA001, NCB_BIA002, NCB_BIA005, NCB_BIA005, NCB_BIA00~
$ EXPERIMENT_STAGE_NAME          <fct> PCM2, PCM2, PCM2, PCM2, PCM2, PCM2, PCM2, PCM2, PCM2, PCM2, PCM2, PCM~
$ GROWING_PROGRAM_REF_ID         <fct> TD, TD, TD, TD, TD, TD, TD, TD, TD, TD, TD, TD, TD, TD, TD, TD, TD, T~
$ PIPELINE_NAME                  <fct> BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, B~
$ TEST_SET_SEASON                <fct> WET, WET, WET, WET, WET, WET, WET, WET, WET, WET, WET, WET, WET, WET,~
$ ME                             <fct> ME1, ME1, ME1, ME1, ME1, ME1, ME1, ME1, ME1, ME1, ME1, ME1, ME1, ME1,~
$ COUNTRY_NAME                   <fct> India, India, India, India, India, India, India, India, India, India,~
$ REP                            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
$ SUB_COUNTRY_NAME               <fct> Maharashtra, Maharashtra, Maharashtra, Maharashtra, Maharashtra, Maha~
$ LOCATION_NAME                  <fct> "Kalwan, MH, IND", "Kalwan, MH, IND", "Kalwan, MH, IND", "Kalwan, MH,~
$ FIELD_NAME...sub.site          <fct> KWN2, KWN2, KWN2, KWN2, KWN2, KWN2, KWN2, KWN2, KWN2, KWN2, KWN2, KWN~
$ IS_IRRIGATED                   <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,~
$ date1                          <dttm> 2017-06-27, 2017-06-27, 2017-06-27, 2017-06-27, 2017-06-27, 2017-06-~
$ PLANTING_MONTH                 <fct> JUN, JUN, JUN, JUN, JUN, JUN, JUN, JUN, JUN, JUN, JUN, JUN, JUN, JUN,~
$ date2                          <dttm> 2017-11-16, 2017-11-16, 2017-11-16, 2017-11-16, 2017-11-16, 2017-11-~
$ PRODUCT_NAME                   <fct> P3401, DKC9141, NK6240, DKC9141, DKC9133, P3401, DKC9207, P3501, DKC8~
$ HARVEST_PLOT_LENGTH            <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ~
$ HARVEST_PLOT_WIDTH             <dbl> 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2,~
$ HARVEST.AREA                   <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ~
$ max_temperature                <dbl> 82.65315, 82.65315, 82.65315, 82.65315, 82.65315, 82.65315, 82.65315,~
$ min_temperature                <dbl> 71.37832, 71.37832, 71.37832, 71.37832, 71.37832, 71.37832, 71.37832,~
$ max_dew_point_temperature      <dbl> 72.30490, 72.30490, 72.30490, 72.30490, 72.30490, 72.30490, 72.30490,~
$ min_dew_point_temperature      <dbl> 68.41189, 68.41189, 68.41189, 68.41189, 68.41189, 68.41189, 68.41189,~
$ avg_dew_point_temperature      <dbl> 70.39301, 70.39301, 70.39301, 70.39301, 70.39301, 70.39301, 70.39301,~
$ total_precipitation            <dbl> 40.0, 40.0, 40.0, 40.0, 40.0, 40.0, 40.0, 40.0, 40.0, 40.0, 40.0, 40.~
$ max_wind_speed                 <dbl> 10.76853, 10.76853, 10.76853, 10.76853, 10.76853, 10.76853, 10.76853,~
$ min_wind_speed                 <dbl> 5.840559, 5.840559, 5.840559, 5.840559, 5.840559, 5.840559, 5.840559,~
$ avg_wind_speed                 <dbl> 8.132168, 8.132168, 8.132168, 8.132168, 8.132168, 8.132168, 8.132168,~
$ avg_wind_direction             <dbl> 232.0839, 232.0839, 232.0839, 232.0839, 232.0839, 232.0839, 232.0839,~
$ max_wind_gust                  <dbl> 24.81399, 24.81399, 24.81399, 24.81399, 24.81399, 24.81399, 24.81399,~
$ max_relative_humidity          <dbl> 94.79161, 94.79161, 94.79161, 94.79161, 94.79161, 94.79161, 94.79161,~
$ min_relative_humidity          <dbl> 67.26643, 67.26643, 67.26643, 67.26643, 67.26643, 67.26643, 67.26643,~
$ avg_relative_humidity          <dbl> 83.84895, 83.84895, 83.84895, 83.84895, 83.84895, 83.84895, 83.84895,~
$ total_downward_solar_radiation <dbl> 2175.196, 2175.196, 2175.196, 2175.196, 2175.196, 2175.196, 2175.196,~
$ max_downward_solar_radiation   <dbl> 334.229, 334.229, 334.229, 334.229, 334.229, 334.229, 334.229, 334.22~
$ total_net_solar_radiation      <dbl> 1825.682, 1825.682, 1825.682, 1825.682, 1825.682, 1825.682, 1825.682,~
$ min_atmospheric_pressure       <dbl> 915.9, 915.9, 915.9, 915.9, 915.9, 915.9, 915.9, 915.9, 915.9, 915.9,~
$ avg_total_cloud_cover          <dbl> 115.41, 115.41, 115.41, 115.41, 115.41, 115.41, 115.41, 115.41, 115.4~
$ avg_snow_depth                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ avg_snow_density               <dbl> 234.377, 234.377, 234.377, 234.377, 234.377, 234.377, 234.377, 234.37~
$ max_soil_temperature_level_1   <dbl> 81.92098, 81.92098, 81.92098, 81.92098, 81.92098, 81.92098, 81.92098,~
$ max_soil_temperature_level_2   <dbl> 77.71469, 77.71469, 77.71469, 77.71469, 77.71469, 77.71469, 77.71469,~
$ max_soil_temperature_level_3   <dbl> 77.13916, 77.13916, 77.13916, 77.13916, 77.13916, 77.13916, 77.13916,~
$ max_soil_temperature_level_4   <dbl> 78.12448, 78.12448, 78.12448, 78.12448, 78.12448, 78.12448, 78.12448,~
$ min_soil_temperature_level_1   <dbl> 73.46294, 73.46294, 73.46294, 73.46294, 73.46294, 73.46294, 73.46294,~
$ min_soil_temperature_level_2   <dbl> 75.03007, 75.03007, 75.03007, 75.03007, 75.03007, 75.03007, 75.03007,~
$ min_soil_temperature_level_3   <dbl> 76.98951, 76.98951, 76.98951, 76.98951, 76.98951, 76.98951, 76.98951,~
$ min_soil_temperature_level_4   <dbl> 78.09580, 78.09580, 78.09580, 78.09580, 78.09580, 78.09580, 78.09580,~
$ avg_soil_temperature_level_1   <dbl> 76.96503, 76.96503, 76.96503, 76.96503, 76.96503, 76.96503, 76.96503,~
$ avg_soil_temperature_level_2   <dbl> 76.34965, 76.34965, 76.34965, 76.34965, 76.34965, 76.34965, 76.34965,~
$ avg_soil_temperature_level_3   <dbl> 77.04685, 77.04685, 77.04685, 77.04685, 77.04685, 77.04685, 77.04685,~
$ avg_soil_temperature_level_4   <dbl> 78.11329, 78.11329, 78.11329, 78.11329, 78.11329, 78.11329, 78.11329,~
$ avg_soil_moisture_level_1      <dbl> 53.87, 53.87, 53.87, 53.87, 53.87, 53.87, 53.87, 53.87, 53.87, 53.87,~
$ avg_soil_moisture_level_2      <dbl> 53.23, 53.23, 53.23, 53.23, 53.23, 53.23, 53.23, 53.23, 53.23, 53.23,~
$ avg_soil_moisture_level_3      <dbl> 49.96, 49.96, 49.96, 49.96, 49.96, 49.96, 49.96, 49.96, 49.96, 49.96,~
$ avg_soil_moisture_level_4      <dbl> 50.36, 50.36, 50.36, 50.36, 50.36, 50.36, 50.36, 50.36, 50.36, 50.36,~
$ Prod                           <dbl> 142, 145, 122, 158, 118, 128, 140, 139, 97, 119, 111, 122, 138, 131, ~
$ DAP                            <dbl> 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142,~

Removendo diferenƧa nos nomes dos locais e produtos


datatrain<- datatrain %>% 
  separate(LOCATION_NAME, c("LOCATION_NAME","A","B"), sep = ",") %>% 
  select(-c(A,B)) %>% 
  mutate(LOCATION_NAME=as.factor(LOCATION_NAME))

datatest <- datatest %>%
  separate(LOCATION_NAME, c("LOCATION_NAME","A","B"), sep = ",") %>% 
  select(-c(A,B)) %>% mutate(LOCATION_NAME=as.factor(LOCATION_NAME))


datatrain$PRODUCT_NAME <- str_replace_all(datatrain$PRODUCT_NAME, "[^[:alnum:]]", "")
datatrain$PRODUCT_NAME <- as.factor(datatrain$PRODUCT_NAME)

datatest$PRODUCT_NAME <- str_replace_all(datatest$PRODUCT_NAME, "[^[:alnum:]]", "")
datatest$PRODUCT_NAME <- as.factor(datatest$PRODUCT_NAME)

Verificando colunas com NAs


gg_miss_fct(x = datatrain, fct = PLANTING_MONTH)+ggtitle("Train")


gg_miss_fct(x = datatest, fct = PLANTING_MONTH)+ggtitle("Test")

Criando modelos de imputação para comparação


datatrain_imp <- datatrain %>% impute_median_if(is.numeric) %>% 
  bind_shadow() %>%   
  add_label_shadow()


datatrain_imp_mean <- datatrain %>% impute_mean_if(is.numeric) %>% 
  bind_shadow() %>%   
  add_label_shadow()


datatrain_imp_rf <- datatrain %>% 
  bind_shadow() %>%
  impute_rf(total_precipitation ~ PLANTING_MONTH+PIPELINE_NAME  +EXPERIMENT_STAGE_NAME+date1) %>% 
  impute_rf(avg_relative_humidity ~ PLANTING_MONTH+PIPELINE_NAME  +EXPERIMENT_STAGE_NAME+date1) %>% 
  add_label_shadow()

Unir os modelos


bound_models <- bind_rows(imp_median = datatrain_imp,
                          imp_mean = datatrain_imp_mean,
                          imp_rf = datatrain_imp_rf,
                          .id = "imp_model")

Plotando os modelos


p1<-ggplot(bound_models, 
       aes(x = total_precipitation, 
           y = avg_relative_humidity, 
           color = any_missing)) + 
  geom_point(shape=21, size=4, alpha=0.5) + 
  facet_wrap(~imp_model, nrow = 5) +
  #scale_color_manual(values = c("black","gray"))+
  theme_bw()

p2<-ggplot(bound_models, 
       aes(x = total_precipitation, 
           y = avg_relative_humidity, 
           color = any_missing)) + 
  geom_point(shape=21, size=4, alpha=0.5) + 
  #scale_color_manual(values = c("black","gray"))+
  facet_wrap(~PLANTING_MONTH+imp_model, nrow = 5) +
  theme_bw()

p1


p2

Selecionando o modelo

selectmodel %>% pivot_longer(cols=1:2) %>% 
  ggplot(aes(x = value)) + 
  geom_histogram() + 
  facet_wrap(~name)+theme_bw()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Verificando média e desvio padrão da precipitação no arquivo de teste


datatest %>% 
  group_by(PLANTING_MONTH) %>% 
  get_summary_stats(total_precipitation,type="mean_sd")
NA

Verificando média e desvio padrão da precipitação no arquivo de treino


datatrain_input %>% 
  group_by(PLANTING_MONTH) %>% 
  get_summary_stats(total_precipitation,type="mean_sd")
NA

Identificando colunas com menos de 2 fatores


cols = colnames(datatrain_input)

for (col in cols){
  if(is.factor(datatrain_input[[col]])){
    cat(col, 'tem', length(levels(datatrain_input[[col]])), 'fatores \n')
  }
}
TEST_SET_NAME tem 658 fatores 
EXPERIMENT_STAGE_NAME tem 5 fatores 
GROWING_PROGRAM_REF_ID tem 20 fatores 
PIPELINE_NAME tem 2 fatores 
TEST_SET_SEASON tem 1 fatores 
ME tem 1 fatores 
COUNTRY_NAME tem 1 fatores 
SUB_COUNTRY_NAME tem 7 fatores 
LOCATION_NAME tem 235 fatores 
FIELD_NAME...sub.site tem 984 fatores 
PLANTING_MONTH tem 5 fatores 
PRODUCT_NAME tem 111 fatores 

Apagando variaveis que vao atrapalhar no modelo


datatrain_input = datatrain_input %>%  select(-c(COUNTRY_NAME,TEST_SET_SEASON,ME,date1,date2))
datatest = datatest %>%  select(-c(COUNTRY_NAME,TEST_SET_SEASON,ME,date1,date2))

Identificação de preditores de variância quase zero

Criando novos dataframes (teste e treino) apenas com as variaveis importantes para o modelo


datatrain_input_2 <- datatrain_input[, -nzv[-c(5,8)]]

datatest_input <- datatest[, -nzv[-c(5,8)]]

dim(datatrain_input)
[1] 16440    53
dim(datatrain_input_2)
[1] 16440    11
dim(datatest)
[1] 2958   53
dim(datatest_input)
[1] 2958   11

Renomeando o cabeƧalho e criando o novo arquivo de treino com os nomes lowercase

train <- datatrain_input_2 %>% janitor::clean_names()
test <- datatest_input %>% janitor::clean_names()

Possuem o mesmo numero de linhas e colunas?


dim(train)==dim(test)
[1] FALSE  TRUE

As colunas possuem os mesmos nomes em ambos dataframes?

names(train) == names(test)
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

Quais são as variaveis?

names(train)
 [1] "experiment_stage_name" "sub_country_name"      "location_name"        
 [4] "field_name_sub_site"   "is_irrigated"          "planting_month"       
 [7] "product_name"          "harvest_area"          "total_precipitation"  
[10] "prod"                  "dap"                  

Iniciando o H2O


h2o.init(max_mem_size = "8g")
 Connection successful!

R is connected to the H2O cluster: 
    H2O cluster uptime:         15 hours 5 minutes 
    H2O cluster timezone:       America/Bahia 
    H2O data parsing timezone:  UTC 
    H2O cluster version:        3.32.1.3 
    H2O cluster version age:    2 months and 9 days  
    H2O cluster name:           H2O_started_from_R_cided_bqt869 
    H2O cluster total nodes:    1 
    H2O cluster total memory:   6.93 GB 
    H2O cluster total cores:    4 
    H2O cluster allowed cores:  4 
    H2O cluster healthy:        TRUE 
    H2O Connection ip:          localhost 
    H2O Connection port:        54321 
    H2O Connection proxy:       NA 
    H2O Internal Security:      FALSE 
    H2O API Extensions:         Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4 
    R Version:                  R version 4.1.0 (2021-05-18) 
gbm1 <- h2o.gbm(y = dependent,
                x = independent,
                training_frame = train_h2o,
                validation_frame = test_h2o,
                ntrees = 120,
                nfolds = 10,
                learn_rate = 0.3,
                score_each_iteration = TRUE, 
                score_tree_interval = 10,
                seed = 1234)

  |                                                                                    
  |                                                                              |   0%
  |                                                                                    
  |=====                                                                         |   6%
  |                                                                                    
  |=========                                                                     |  11%
  |                                                                                    
  |==============                                                                |  18%
  |                                                                                    
  |=====================                                                         |  27%
  |                                                                                    
  |==========================                                                    |  34%
  |                                                                                    
  |======================================                                        |  48%
  |                                                                                    
  |======================================                                        |  49%
  |                                                                                    
  |=========================================================                     |  73%
  |                                                                                    
  |=============================================================                 |  79%
  |                                                                                    
  |====================================================================          |  87%
  |                                                                                    
  |=====================================================================         |  89%
  |                                                                                    
  |=======================================================================       |  91%
  |                                                                                    
  |========================================================================      |  92%
  |                                                                                    
  |==========================================================================    |  95%
  |                                                                                    
  |============================================================================= |  98%
  |                                                                                    
  |==============================================================================| 100%

Plots


plot(gbm1, timestep = "number_of_trees", metric = "rmse")

plot(gbm1, timestep = "number_of_trees", metric = "mae")


h2o.scoreHistory(gbm1) %>%
  select(number_of_trees, training_rmse, validation_rmse) %>%
  pivot_longer(cols = -1) %>%
  mutate(a = 1) %>%
  mutate(name=ifelse(name=="training_rmse","Training","Validation")) %>% 
  ggplot(aes(x = number_of_trees,
             y = value,
             col = name)) +
  geom_line(size=1) +
  theme_test(15) +
  xlab("Number of Trees") + ylab("Root Mean Squared Error (RMSE)") +
  scale_color_manual(values = c("blue", "gold2"))+
  ggtitle("Scoring History") +
  theme(legend.position = c(0.90,0.88),
        legend.title = element_blank(),
        plot.title = element_text(hjust = 0.5))

NA
NA

SumƔrio com as mƩtricas dos modelos


h2o.performance(gbm1, valid = T)
H2ORegressionMetrics: gbm
** Reported on validation data. **

MSE:  2.171274
RMSE:  1.473524
MAE:  0.6627271
RMSLE:  0.02875685
Mean Residual Deviance :  2.171274

Aplicando o modelo escolhido no banco de dados de teste


predictions<-as.data.frame(h2o.predict(gbm1,test_h2o))

  |                                                                                    
  |                                                                              |   0%
  |                                                                                    
  |==============================================================================| 100%
pred <- predictions[,1]

testando <- cbind(test,pred)

Correlação de Pearson

cor(testando$prod,testando$pred)
[1] 0.997084

Root Mean Squared Error (RMSE)

error_mod <-  testando$prod-testando$pred

# Function for Root Mean Squared Error
RMSE <- function(x) { sqrt(mean(x^2)) }

rmse_mod <- RMSE(error_mod)

rmse_mod
[1] 1.473524

Mean Absolute Error (MAE)

# Function for Mean Absolute Error
mae <- function(x) { mean(abs(x)) }
mae_mod <- mae(error_mod)

mae_mod
[1] 0.662727

R2

# R2

SQt = sum((mean(testando$prod) - testando$prod)^2)
SQres = sum((testando$pred - testando$prod)^2)
R2 = (SQt - SQres) / SQt
R2
[1] 0.9940594

R2.adj

# R2.adj

obs <- nrow(testando)
k <- ncol(testando)-1

R2.adj <- 1-((1-R2)*(obs-1)/(obs-k-1))
R2.adj
[1] 0.9940372

Histogram of residuals


a<-gghistostats(data.frame(error_mod),
                x = error_mod,
                xlab = "Residuals",
                title = "Histogram of residuals",
                 type = "p",
                normal.curve = T,
                normal.curve.args = list(size = 1))
a

GrÔfico de correlação


b<-ggplot(testando, aes(x = pred, y = prod)) +
  geom_point(alpha=0.1, size=4,col="coral2") +
  geom_smooth(method = "lm", se = T, col="black", linetype=2, size=0.7)+
  stat_cor(method="pearson",  size=5,r.digits = 3, col="red")+
  ylab("Prod")+
  xlab("Predict")+
  theme_bw()+
  geom_rug(col="goldenrod1", alpha=0.2)+
  ggtitle(paste("Relationship between prod and predict","\nn =",obs,"observations"))+
  labs(subtitle = paste("RMSE =",round(rmse_mod,3),"MAE =",round(mae_mod,3),
                        "\nR² =",round(R2,5),"R².adj =",
                        round(R2.adj,5)),
       caption = "H2O Regression Model: Gradient Boosting Machine\n10-fold cross-validation\nDatabase: EAC")

b
`geom_smooth()` using formula 'y ~ x'

---
title: "Desafio Master"
author: "Cid Póvoas"
output:
  html_notebook:
    toc: yes
  html_document:
    keep_md: yes
    toc: yes
  word_document:
    toc: yes
  pdf_document:
    toc: yes
editor_options:
  chunk_output_type: inline
  
---

# Carregando pacotes

```{r setup, include=T}


if(!require("pacman")) install.packages("pacman")

pacman::p_load(tidyverse,
               lubridate,
               ggpubr,
               ggstatsplot,
               naniar,
               simputation,
               caret,
               janitor,
               knitr,
               h2o)

knitr::opts_chunk$set(echo = TRUE)

```

```{r setup1, include=FALSE}
my.path <- dirname(rstudioapi::getActiveDocumentContext()$path)
setwd(my.path)

```

# Carregar o banco de dados

```{r}

datatrain <- read.csv("dados_treinamento.csv")
datatest <- read.csv("dados_teste.csv")

```

# Estrutura dos dados

```{r}
datatrain %>% glimpse
```
```{r}
datatest %>% glimpse

```

# Primeiras linhas do datatrain

```{r}

datatrain %>% head

```

# Primeiras linhas do datatest

```{r}

datatest %>% head

```

# Padronizar coluna de datas do arquivo treino 

```{r}

datatrain %>% select(HARVEST_DATE) %>% head(10)

datatrain <- datatrain %>% mutate(HARVEST_DATE = substr(HARVEST_DATE, start = 1, stop = 10),
                                  HARVEST_DATE = parse_date_time(HARVEST_DATE, orders="mdy"))

datatrain %>% select(HARVEST_DATE) %>% head(10)


datatrain <- datatrain %>% mutate(PLANTING_DATE = substr(PLANTING_DATE, start = 1, stop = 10),
                                  PLANTING_DATE = parse_date_time(PLANTING_DATE, orders="mdy"))


```

# Padronizar coluna de datas do arquivo teste

```{r}

datatest <- datatest %>% mutate(HARVEST_DATE = substr(HARVEST_DATE, start = 1, stop = 10),
                                HARVEST_DATE = parse_date_time(HARVEST_DATE, orders="mdy"))


datatest <- datatest %>% mutate(PLANTING_DATE = substr(PLANTING_DATE, start = 1, stop = 10),
                                PLANTING_DATE = parse_date_time(PLANTING_DATE, orders="mdy"))


```


# Criando a variavel DAP (Dias após plantio)

```{r}


datatrain <- datatrain %>% rename(date1 = PLANTING_DATE,
                                  date2 = HARVEST_DATE)

datatrain <- datatrain %>% mutate(DAP=date2-date1,
                                  DAP=as.numeric(DAP))

datatest <- datatest %>% rename(date1 = PLANTING_DATE,
                                date2= HARVEST_DATE)






###########################################################
datatest <- datatest %>% mutate(DAP=date2-date1,
                                DAP=as.numeric(DAP))

```

```{r, fig.width=7, fig.height=4}

datatrain %>% ggboxplot(y="DAP", xlab = " ",title = "Treino", fill = "red", bxp.errorbar = T)+coord_flip()

summary(datatrain$DAP)
```


```{r}

datatest %>% ggboxplot(y="DAP", xlab=" ",ylab = "DAP",title = "Teste", fill = "red", bxp.errorbar = T)+coord_flip()

summary(datatest$DAP)
```

# Transformar variaveis que são caracteres em fatores

```{r}

datatrain <- datatrain  %>% 
             mutate_if(is.character, as.factor) %>% 
             as_tibble() 


datatest <- datatest  %>% 
            mutate_if(is.character, as.factor) %>% 
            as_tibble()


##############################################################################




datatrain %>% glimpse()

```



# Removendo diferença nos nomes dos locais e produtos

```{r, warning=FALSE, message=FALSE}

datatrain<- datatrain %>% 
  separate(LOCATION_NAME, c("LOCATION_NAME","A","B"), sep = ",") %>% 
  select(-c(A,B)) %>% 
  mutate(LOCATION_NAME=as.factor(LOCATION_NAME))

datatest <- datatest %>%
  separate(LOCATION_NAME, c("LOCATION_NAME","A","B"), sep = ",") %>% 
  select(-c(A,B)) %>% mutate(LOCATION_NAME=as.factor(LOCATION_NAME))


datatrain$PRODUCT_NAME <- str_replace_all(datatrain$PRODUCT_NAME, "[^[:alnum:]]", "")
datatrain$PRODUCT_NAME <- as.factor(datatrain$PRODUCT_NAME)

datatest$PRODUCT_NAME <- str_replace_all(datatest$PRODUCT_NAME, "[^[:alnum:]]", "")
datatest$PRODUCT_NAME <- as.factor(datatest$PRODUCT_NAME)

```


# Verificando colunas com NAs

```{r, fig.width=10,fig.height=10}

gg_miss_fct(x = datatrain, fct = PLANTING_MONTH)+ggtitle("Train")

gg_miss_fct(x = datatest, fct = PLANTING_MONTH)+ggtitle("Test")

```

# Criando modelos de imputação para comparação

```{r}

datatrain_imp <- datatrain %>% impute_median_if(is.numeric) %>% 
  bind_shadow() %>%   
  add_label_shadow()


datatrain_imp_mean <- datatrain %>% impute_mean_if(is.numeric) %>% 
  bind_shadow() %>%   
  add_label_shadow()


datatrain_imp_rf <- datatrain %>% 
  bind_shadow() %>%
  impute_rf(total_precipitation ~ PLANTING_MONTH+PIPELINE_NAME  +EXPERIMENT_STAGE_NAME+date1) %>% 
  impute_rf(avg_relative_humidity ~ PLANTING_MONTH+PIPELINE_NAME  +EXPERIMENT_STAGE_NAME+date1) %>% 
  add_label_shadow()


```

# Unir os modelos

```{r}

bound_models <- bind_rows(imp_median = datatrain_imp,
                          imp_mean = datatrain_imp_mean,
                          imp_rf = datatrain_imp_rf,
                          .id = "imp_model")
```


# Plotando os modelos

```{r, fig.width=15,fig.height=10}

p1<-ggplot(bound_models, 
       aes(x = total_precipitation, 
           y = avg_relative_humidity, 
           color = any_missing)) + 
  geom_point(shape=21, size=4, alpha=0.5) + 
  facet_wrap(~imp_model, nrow = 5) +
  #scale_color_manual(values = c("black","gray"))+
  theme_bw()

p2<-ggplot(bound_models, 
       aes(x = total_precipitation, 
           y = avg_relative_humidity, 
           color = any_missing)) + 
  geom_point(shape=21, size=4, alpha=0.5) + 
  #scale_color_manual(values = c("black","gray"))+
  facet_wrap(~PLANTING_MONTH+imp_model, nrow = 5) +
  theme_bw()

p1

p2

```

# Selecionando o modelo

```{r, warning=FALSE,message=F}

pc <- preProcess(datatrain, method = c("medianImpute"))


datatrain_input <- predict(pc, datatrain)


selectmodel <- data.frame(rf=datatrain_imp_rf$total_precipitation,
                          median=datatrain_input$total_precipitation)

selectmodel %>% pivot_longer(cols=1:2) %>% 
  ggplot(aes(x = value)) + 
  geom_histogram() + 
  facet_wrap(~name)+theme_bw()


datatrain_input$total_precipitation <-  datatrain_imp_rf$total_precipitation

```

# Verificando média e desvio padrão da precipitação no arquivo de teste

```{r}

datatest %>% 
  group_by(PLANTING_MONTH) %>% 
  get_summary_stats(total_precipitation,type="mean_sd")

```

# Verificando média e desvio padrão da precipitação no arquivo de treino

```{r}

datatrain_input %>% 
  group_by(PLANTING_MONTH) %>% 
  get_summary_stats(total_precipitation,type="mean_sd")

```

# Identificando colunas com menos de 2 fatores

```{r}

cols = colnames(datatrain_input)

for (col in cols){
  if(is.factor(datatrain_input[[col]])){
    cat(col, 'tem', length(levels(datatrain_input[[col]])), 'fatores \n')
  }
}

```

# Apagando variaveis que vao atrapalhar no modelo

```{r}

datatrain_input = datatrain_input %>%  select(-c(COUNTRY_NAME,TEST_SET_SEASON,ME,date1,date2))
datatest = datatest %>%  select(-c(COUNTRY_NAME,TEST_SET_SEASON,ME,date1,date2))

```


# Identificação de preditores de variância quase zero

```{r}

nzv <- nearZeroVar(datatrain_input, saveMetrics = TRUE, freqCut = 2, uniqueCut = 20)
nzv %>% filter(nzv==TRUE) %>% mutate(n=row_number(nzv)) 

```

# Criando novos dataframes (teste e treino) apenas com as variaveis importantes para o modelo

```{r}

nzv <- nearZeroVar(datatrain_input, freqCut = 2, uniqueCut = 20)

datatrain_input_2 <- datatrain_input[, -nzv[-c(5,8)]]

datatest_input <- datatest[, -nzv[-c(5,8)]]

dim(datatrain_input)
dim(datatrain_input_2)

dim(datatest)
dim(datatest_input)
```

# Renomeando o cabeçalho e criando o novo arquivo de treino com os nomes lowercase

```{r}
train <- datatrain_input_2 %>% janitor::clean_names()
test <- datatest_input %>% janitor::clean_names()
```

# Possuem o mesmo numero de linhas e colunas?

```{r}

dim(train)==dim(test)

```

# As colunas possuem os mesmos nomes em ambos dataframes?

```{r}
names(train) == names(test)
```

# Quais são as variaveis?

```{r}
names(train)
```


# Iniciando o H2O

```{r}

h2o.init(max_mem_size = "8g")

```


```{r}

train_h2o <- as.h2o(train)
test_h2o <- as.h2o(test)

dependent <- "prod"
independent <- setdiff(colnames(train_h2o), dependent)

gbm1 <- h2o.gbm(y = dependent,
                x = independent,
                training_frame = train_h2o,
                validation_frame = test_h2o,
                ntrees = 120,
                nfolds = 10,
                learn_rate = 0.3,
                score_each_iteration = TRUE, 
                score_tree_interval = 10,
                seed = 1234)


```

# Plots

```{r, warning=FALSE,message=FALSE,fig.height=6,fig.width=10}

plot(gbm1, timestep = "number_of_trees", metric = "rmse")
plot(gbm1, timestep = "number_of_trees", metric = "mae")

```


```{r, warning=FALSE,message=FALSE,fig.height=6,fig.width=10}

h2o.scoreHistory(gbm1) %>%
  select(number_of_trees, training_rmse, validation_rmse) %>%
  pivot_longer(cols = -1) %>%
  mutate(a = 1) %>%
  mutate(name=ifelse(name=="training_rmse","Training","Validation")) %>% 
  ggplot(aes(x = number_of_trees,
             y = value,
             col = name)) +
  geom_line(size=1) +
  theme_test(15) +
  xlab("Number of Trees") + ylab("Root Mean Squared Error (RMSE)") +
  scale_color_manual(values = c("blue", "gold2"))+
  ggtitle("Scoring History") +
  theme(legend.position = c(0.90,0.88),
        legend.title = element_blank(),
        plot.title = element_text(hjust = 0.5))


```

# Sumário com as métricas dos modelos

```{r, warning=FALSE,message=FALSE,fig.height=6,fig.width=16}

h2o.performance(gbm1, valid = T)

```

# Aplicando o modelo escolhido no banco de dados de teste

```{r}

predictions<-as.data.frame(h2o.predict(gbm1,test_h2o))
pred <- predictions[,1]

testando <- cbind(test,pred)

```

# Correlação de Pearson

```{r}
cor(testando$prod,testando$pred)

```

# Root Mean Squared Error (RMSE)

```{r}
error_mod <-  testando$prod-testando$pred

# Function for Root Mean Squared Error
RMSE <- function(x) { sqrt(mean(x^2)) }

rmse_mod <- RMSE(error_mod)

rmse_mod
```

# Mean Absolute Error (MAE)

```{r}
# Function for Mean Absolute Error
mae <- function(x) { mean(abs(x)) }
mae_mod <- mae(error_mod)

mae_mod
```

# R2

```{r}
# R2

SQt = sum((mean(testando$prod) - testando$prod)^2)
SQres = sum((testando$pred - testando$prod)^2)
R2 = (SQt - SQres) / SQt
R2

```

# R2.adj

```{r}
# R2.adj

obs <- nrow(testando)
k <- ncol(testando)-1

R2.adj <- 1-((1-R2)*(obs-1)/(obs-k-1))
R2.adj


```

# Histogram of residuals

```{r, warning=FALSE,message=FALSE, fig.width=7, fig.height=5}

a<-gghistostats(data.frame(error_mod),
                x = error_mod,
                xlab = "Residuals",
                title = "Histogram of residuals",
                 type = "p",
                normal.curve = T,
                normal.curve.args = list(size = 1))

a
```

# Gráfico de correlação

```{r, echo=TRUE, warning=FALSE,message=FALSE, fig.width=7, fig.height=5}

b<-ggplot(testando, aes(x = pred, y = prod)) +
  geom_point(alpha=0.1, size=4,col="coral2") +
  geom_smooth(method = "lm", se = T, col="black", linetype=2, size=0.7)+
  stat_cor(method="pearson",  size=5,r.digits = 3, col="red")+
  ylab("Prod")+
  xlab("Predict")+
  theme_bw()+
  geom_rug(col="goldenrod1", alpha=0.2)+
  ggtitle(paste("Relationship between prod and predict","\nn =",obs,"observations"))+
  labs(subtitle = paste("RMSE =",round(rmse_mod,3),"MAE =",round(mae_mod,3),
                        "\nR² =",round(R2,5),"R².adj =",
                        round(R2.adj,5)),
       caption = "H2O Regression Model: Gradient Boosting Machine\n10-fold cross-validation\nDatabase: EAC")

b

```
