library(rgdal)
## Loading required package: sp
## rgdal: version: 1.1-10, (SVN revision 622)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 2.0.1, released 2015/09/15
##  Path to GDAL shared files: C:/Users/Aritra/Documents/R/win-library/3.3/rgdal/gdal
##  Loaded PROJ.4 runtime: Rel. 4.9.2, 08 September 2015, [PJ_VERSION: 492]
##  Path to PROJ.4 shared files: C:/Users/Aritra/Documents/R/win-library/3.3/rgdal/proj
##  Linking to sp version: 1.2-3
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(DT)
library(pairsD3)
library(ggplot2)
library(GGally)
library (boot)
library(dplyr)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:GGally':
## 
##     nasa
## The following objects are masked from 'package:data.table':
## 
##     between, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(magrittr)   
library(simputation)
library(corrplot)
library(Hmisc)
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
## 
##     melanoma
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:boot':
## 
##     aml
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     combine, src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
library(leaps)
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
## 
##     extract
library(caret)
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
library(purrr)
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:caret':
## 
##     lift
## The following object is masked from 'package:magrittr':
## 
##     set_names
## The following objects are masked from 'package:dplyr':
## 
##     contains, order_by
## The following object is masked from 'package:data.table':
## 
##     transpose
library(Boruta)
## Loading required package: ranger
library(gsubfn)
## Loading required package: proto
library(mice)
## Loading required package: Rcpp
## mice 2.25 2015-11-09
## 
## Attaching package: 'mice'
## The following object is masked from 'package:tidyr':
## 
##     complete
library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
## 
##     some
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:boot':
## 
##     logit
library(leaps)
library(lattice)
library(PerformanceAnalytics)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## The following object is masked from 'package:data.table':
## 
##     last
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
library(d3heatmap)
library(leaflet)
options(scipen = 999)
setwd("C:\\Users\\Aritra\\Dropbox\\Resolver\\Data")
df<-tbl_df(data.frame(read.csv("Cars.csv",header = T, stringsAsFactors = F)))

dataclean<-function(x){
  x<-gsub("Dollar", "", x);x<-gsub("Euro", "", x);x<-trimws(x);x[x=="."]=NA;x}

replace_dollar_euro<-function(x){
  x<-gsub("[$]","Dollar ",x);x<-gsub("[€]","Euro ",x);x<-trimws(x);x}

#replacing dots with NA
df[df=="."]=NA

#function for flatten correlation and p value 

flattenCorrMatrix <- function(cormat, pmat) {
  ut <- upper.tri(cormat)
  data.frame(
    row = rownames(cormat)[row(cormat)[ut]],
    column = rownames(cormat)[col(cormat)[ut]],
    cor  =(cormat)[ut],
    p = pmat[ut])}

df1<-df %>% 
  mutate(MSRP=replace_dollar_euro(MSRP),
         Cost.Price=replace_dollar_euro(Cost.Price),
         Invoice=replace_dollar_euro(Invoice)) %>% 
  separate(Cost.Price, c("Currency", "Cost.Price1", "z"), " ", fill = "right",remove = F) %>% 
  select(-Cost.Price1,-z) %>% 
  mutate(MSRP=gsub(",", "", dataclean(MSRP)),
         Cost.Price=gsub(",", "",dataclean(Cost.Price)),
         Invoice=gsub(",", "",dataclean(Invoice))) %>% 
  mutate(MSRP=as.numeric(MSRP),
         Cost.Price=as.numeric(Cost.Price),
         Invoice=as.numeric(Invoice),
         EngineSize=as.numeric(EngineSize),
         Make=as.factor(Make),
         Model=as.factor(Model),
         Type=as.factor(Type),
         Origin=as.factor(Origin),
         DriveTrain=as.factor(DriveTrain),
         Currency=as.factor(Currency),
         MPG_Highway=as.numeric(MPG_Highway))

df<-na.omit(df1)

datatable(md.pattern(df1))
aggr_plot <- aggr(df1, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(data), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))

## 
##  Variables sorted by number of missings: 
##                    Variable       Count
##                 MPG_Highway 0.156542056
##                  EngineSize 0.058411215
##                        MSRP 0.037383178
##                   Cylinders 0.004672897
##                        Make 0.000000000
##                       Model 0.000000000
##                        Type 0.000000000
##                      Origin 0.000000000
##                  DriveTrain 0.000000000
##                  Cost.Price 0.000000000
##                    Currency 0.000000000
##                     Invoice 0.000000000
##                  Horsepower 0.000000000
##                    MPG_City 0.000000000
##                      Weight 0.000000000
##                   Wheelbase 0.000000000
##                      Length 0.000000000
##  Time.in.Inventory.in.days. 0.000000000
# panel.cor = function(x, y, digits=2, prefix="", cex.cor, ...){
#   usr = par("usr")
#   on.exit(par(usr))
#   par(usr = c(0,1,0,1))
#   r = abs(cor(x,y,use='complete.obs'))
#   txt=format(c(r,0.123456789),digits=digits)[1]
#   txt=paste(prefix,txt,sep='')
#   if(missing(cex.cor)) cex.cor = 0.8/strwidth(txt)
#   text(0.5, 0.5, txt, cex = cex.cor*(1+r)/2)
# }
# panel.hist = function(x, ...){
#   usr = par('usr')
#   on.exit(par(usr))
#   par(usr = c(usr[1:2], 0, 1.5))
#   h = hist(x, plot=FALSE)
#   breaks = h$breaks
#   nB = length(breaks)
#   y=h$counts
#   y=y/max(y)
#   rect(breaks[-nB], 0, breaks[-1], y, col='white',...)
# }
# panel.lm = function(x, y, col=par('col'), bg = NA, pch=par('pch'),
#                     cex=1, col.smooth='black', ...){
#   points(x,y,pch=pch, col=col, bg=bg, cex=cex)
#   abline(stats::lm(y~x), col=col.smooth,...)
# }
# pairs(df[,-c(1:5,8,10,18)], upper.panel = panel.cor,
#       diag.panel = panel.hist,
#       lower.panel = panel.smooth)
# pairs(df[,-c(1:5,8,10,18)], pch=".", upper.panel = panel.cor,
#       diag.panel = panel.hist, lower.panel = panel.lm,main="T")
# ggcorr(df[,-c(1:5,8,10,18)], palette = "RdBu",label = TRUE)
datatable(df, rownames = FALSE)
chart.Correlation(df[,-c(1:5,8,18)], histogram=TRUE, pch=18)

pairsD3(df[,-c(1:5,8,18)],group=unlist(df[,3]),big=T)
## Input to asJSON(keep_vec_names=TRUE) is a named vector. In a future version of jsonlite, this option will not be supported, and named vectors will be translated into arrays instead of objects. If you want JSON object output, please use a named list instead. See ?toJSON.
pairsD3(df[,-c(1:5,8,18)],group=unlist(df[,4]),big=T)
## Input to asJSON(keep_vec_names=TRUE) is a named vector. In a future version of jsonlite, this option will not be supported, and named vectors will be translated into arrays instead of objects. If you want JSON object output, please use a named list instead. See ?toJSON.
pairsD3(df[,-c(1:5,8,18)],group=unlist(df[,5]),big=T)
## Input to asJSON(keep_vec_names=TRUE) is a named vector. In a future version of jsonlite, this option will not be supported, and named vectors will be translated into arrays instead of objects. If you want JSON object output, please use a named list instead. See ?toJSON.
d3heatmap(df[,-c(1:5,8,18)], scale = "column", colors = "Spectral")
df_cor<-cor(df[,-c(1:5,8,18)])
df_cor
##                   MSRP Cost.Price    Invoice EngineSize  Cylinders
## MSRP         1.0000000  1.0000000  0.9986051  0.5195479  0.4762203
## Cost.Price   1.0000000  1.0000000  0.9986051  0.5195487  0.4762213
## Invoice      0.9986051  0.9986051  1.0000000  0.5231932  0.4786950
## EngineSize   0.5195479  0.5195487  0.5231932  1.0000000  0.9072983
## Cylinders    0.4762203  0.4762213  0.4786950  0.9072983  1.0000000
## Horsepower   0.4363409  0.4363420  0.4406733  0.7857990  0.8058041
## MPG_City    -0.4866326 -0.4866331 -0.4908774 -0.8068176 -0.7659206
## MPG_Highway -0.4334796 -0.4334799 -0.4371530 -0.7615449 -0.7114240
## Weight       0.4267140  0.4267152  0.4266538  0.7963059  0.7365334
## Wheelbase    0.3092619  0.3092630  0.3066052  0.6400867  0.5485896
## Length       0.2690598  0.2690606  0.2685344  0.6377372  0.5488357
##             Horsepower   MPG_City MPG_Highway     Weight  Wheelbase
## MSRP         0.4363409 -0.4866326  -0.4334796  0.4267140  0.3092619
## Cost.Price   0.4363420 -0.4866331  -0.4334799  0.4267152  0.3092630
## Invoice      0.4406733 -0.4908774  -0.4371530  0.4266538  0.3066052
## EngineSize   0.7857990 -0.8068176  -0.7615449  0.7963059  0.6400867
## Cylinders    0.8058041 -0.7659206  -0.7114240  0.7365334  0.5485896
## Horsepower   1.0000000 -0.7427026  -0.6702211  0.6319579  0.3805088
## MPG_City    -0.7427026  1.0000000   0.9414652 -0.8390258 -0.5820273
## MPG_Highway -0.6702211  0.9414652   1.0000000 -0.8309596 -0.5506023
## Weight       0.6319579 -0.8390258  -0.8309596  1.0000000  0.7777583
## Wheelbase    0.3805088 -0.5820273  -0.5506023  0.7777583  1.0000000
## Length       0.3720035 -0.5445035  -0.4619017  0.6992823  0.8913818
##                 Length
## MSRP         0.2690598
## Cost.Price   0.2690606
## Invoice      0.2685344
## EngineSize   0.6377372
## Cylinders    0.5488357
## Horsepower   0.3720035
## MPG_City    -0.5445035
## MPG_Highway -0.4619017
## Weight       0.6992823
## Wheelbase    0.8913818
## Length       1.0000000
d3heatmap(df_cor, scale = "column", colors = "Blues")
df_cor_pval<-rcorr(as.matrix(df[,-c(1:5,8,10,18)]))
df_cor_pval$P
##                                 MSRP               Cost.Price
## MSRP                              NA 0.0000000000000000000000
## Cost.Price  0.0000000000000000000000                       NA
## Invoice     0.0000000000000000000000 0.0000000000000000000000
## Cylinders   0.0000000000000000000000 0.0000000000000000000000
## Horsepower  0.0000000000000000000000 0.0000000000000000000000
## MPG_City    0.0000000000000000000000 0.0000000000000000000000
## MPG_Highway 0.0000000000000002220446 0.0000000000000002220446
## Weight      0.0000000000000008881784 0.0000000000000008881784
## Wheelbase   0.0000000124091847908403 0.0000000124076737773038
## Length      0.0000008525662378300325 0.0000008524996009118269
##                              Invoice Cylinders           Horsepower
## MSRP        0.0000000000000000000000         0 0.000000000000000000
## Cost.Price  0.0000000000000000000000         0 0.000000000000000000
## Invoice                           NA         0 0.000000000000000000
## Cylinders   0.0000000000000000000000        NA 0.000000000000000000
## Horsepower  0.0000000000000000000000         0                   NA
## MPG_City    0.0000000000000000000000         0 0.000000000000000000
## MPG_Highway 0.0000000000000000000000         0 0.000000000000000000
## Weight      0.0000000000000008881784         0 0.000000000000000000
## Wheelbase   0.0000000167518057203608         0 0.000000000001227685
## Length      0.0000008971578357996890         0 0.000000000004182654
##             MPG_City              MPG_Highway                   Weight
## MSRP               0 0.0000000000000002220446 0.0000000000000008881784
## Cost.Price         0 0.0000000000000002220446 0.0000000000000008881784
## Invoice            0 0.0000000000000000000000 0.0000000000000008881784
## Cylinders          0 0.0000000000000000000000 0.0000000000000000000000
## Horsepower         0 0.0000000000000000000000 0.0000000000000000000000
## MPG_City          NA 0.0000000000000000000000 0.0000000000000000000000
## MPG_Highway        0                       NA 0.0000000000000000000000
## Weight             0 0.0000000000000000000000                       NA
## Wheelbase          0 0.0000000000000000000000 0.0000000000000000000000
## Length             0 0.0000000000000000000000 0.0000000000000000000000
##                        Wheelbase               Length
## MSRP        0.000000012409184791 0.000000852566237830
## Cost.Price  0.000000012407673777 0.000000852499600912
## Invoice     0.000000016751805720 0.000000897157835800
## Cylinders   0.000000000000000000 0.000000000000000000
## Horsepower  0.000000000001227685 0.000000000004182654
## MPG_City    0.000000000000000000 0.000000000000000000
## MPG_Highway 0.000000000000000000 0.000000000000000000
## Weight      0.000000000000000000 0.000000000000000000
## Wheelbase                     NA 0.000000000000000000
## Length      0.000000000000000000                   NA
write.csv(df,"corr_df.csv")

df_final_corr<-tbl_df(flattenCorrMatrix(df_cor_pval$r, df_cor_pval$P))

#filter significant correlation 
df_final_corr %>% 
  filter(p<.05)
## # A tibble: 45 × 4
##           row     column       cor     p
##        <fctr>     <fctr>     <dbl> <dbl>
## 1        MSRP Cost.Price 1.0000000     0
## 2        MSRP    Invoice 0.9986051     0
## 3  Cost.Price    Invoice 0.9986051     0
## 4        MSRP  Cylinders 0.4762203     0
## 5  Cost.Price  Cylinders 0.4762213     0
## 6     Invoice  Cylinders 0.4786950     0
## 7        MSRP Horsepower 0.4363409     0
## 8  Cost.Price Horsepower 0.4363420     0
## 9     Invoice Horsepower 0.4406734     0
## 10  Cylinders Horsepower 0.8058041     0
## # ... with 35 more rows
rm(df,df1,df_final_corr)

df<-tbl_df(read.csv("Cars.csv",header = T, stringsAsFactors = F))

# Function to clean the data.

dataclean<-function(x){
  x<-gsub("Dollar", "", x);x<-gsub("Euro", "", x);x<-trimws(x);x[x=="."]=NA;x}

# Function to replace dollar and euro symbols with text

replace_dollar_euro<-function(x){
  x<-gsub("[$]","Dollar ",x);x<-gsub("[€]","Euro ",x);x<-trimws(x);x}


# function for feature selection in msrp 

featureselect<-function(df){
  Boruta(MSRP~.- MtoC_rate -ItoM_rate -MtoI_rate-Cost.Price_new -MSRP_new -Invoice_new -Model -MSRP -Cost.Price -Invoice -Invoice_WP -MSRP_WP -Time.in.Inventory.in.days. -Currency, data = df, doTrace = 2)}

# function to plot important variables (Feature selection)

plot_ImpVar<-function(df){
  plot(df, xlab = "", xaxt = "n")
  lz<-lapply(1:ncol(df$ImpHistory),function(i)
    df$ImpHistory[is.finite(df$ImpHistory[,i]),i])
  names(lz) <- colnames(df$ImpHistory)
  Labels <- sort(sapply(lz,median))
  axis(side = 1,las=2,labels = names(Labels),
       at = 1:ncol(df$ImpHistory), cex.axis = 0.7)}


#Reading data to a tibble

df<-tbl_df(read.csv("Cars.csv",header = T, stringsAsFactors = F))

#replacing dotswith NA

df[df=="."]=NA

#Imputation

df<-df %>% 
  mutate(MSRP=replace_dollar_euro(MSRP),
         Cost.Price=replace_dollar_euro(Cost.Price),
         Invoice=replace_dollar_euro(Invoice)) %>% 
  separate(Cost.Price, c("Currency", "Cost.Price1", "z"), " ", fill = "right",remove = F) %>% 
  select(-Cost.Price1,-z) %>% 
  mutate(MSRP=gsub(",", "", dataclean(MSRP)),
         Cost.Price=gsub(",", "",dataclean(Cost.Price)),
         Invoice=gsub(",", "",dataclean(Invoice))) %>% 
  mutate(MSRP=as.numeric(MSRP),
         Cost.Price=as.numeric(Cost.Price),
         Invoice=as.numeric(Invoice),
         EngineSize=as.factor(EngineSize),
         Make=as.factor(Make),
         Model=as.factor(Model),
         Type=as.factor(Type),
         Origin=as.factor(Origin),
         DriveTrain=as.factor(DriveTrain),
         Currency=as.factor(Currency),
         MPG_Highway=as.numeric(MPG_Highway)) %>% 
  impute_lm(MPG_Highway~MPG_City) %>% 
  impute_lm(Cylinders~Horsepower) %>% 
  impute_rf(EngineSize~Cylinders) %>% 
  impute_lm(MSRP~Cost.Price)

#Euro to Dollar conversion 

df1<-df %>% 
  filter(Currency=="Euro") %>% 
  mutate(MSRP=1.1*MSRP,
         Cost.Price=1.1*Cost.Price,
         Invoice=1.1*Invoice)
df2<-df %>% 
  filter(Currency!="Euro")

df<-rbind(df1,df2)

rm(df2,df1)


# From https://www.census.gov/geo/maps-data/data/cbf/cbf_state.html
continent <- readOGR("continent.shp",
                  layer = "continent", verbose = FALSE)
CONTINENT <- subset(continent, continent$CONTINENT %in% c(
  "Asia","Europe","North America","South America"))
map <- leaflet(CONTINENT)
# p<-data.frame(value=c(0.3691589,0.2873832,0.3434579,0.3434579),names=c("Asia","Europe","North America","South America"))
# p
# 
# qpal <- colorQuantile("Blues", p$value, n =4)
map %>%
  addPolygons(
    stroke = FALSE, fillOpacity = 0.5, smoothFactor = 0.5)