library(rgdal)
## Loading required package: sp
## rgdal: version: 1.1-10, (SVN revision 622)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 2.0.1, released 2015/09/15
## Path to GDAL shared files: C:/Users/Aritra/Documents/R/win-library/3.3/rgdal/gdal
## Loaded PROJ.4 runtime: Rel. 4.9.2, 08 September 2015, [PJ_VERSION: 492]
## Path to PROJ.4 shared files: C:/Users/Aritra/Documents/R/win-library/3.3/rgdal/proj
## Linking to sp version: 1.2-3
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(DT)
library(pairsD3)
library(ggplot2)
library(GGally)
library (boot)
library(dplyr)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:GGally':
##
## nasa
## The following objects are masked from 'package:data.table':
##
## between, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(magrittr)
library(simputation)
library(corrplot)
library(Hmisc)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:boot':
##
## aml
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## combine, src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
library(leaps)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
##
## extract
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
library(purrr)
##
## Attaching package: 'purrr'
## The following object is masked from 'package:caret':
##
## lift
## The following object is masked from 'package:magrittr':
##
## set_names
## The following objects are masked from 'package:dplyr':
##
## contains, order_by
## The following object is masked from 'package:data.table':
##
## transpose
library(Boruta)
## Loading required package: ranger
library(gsubfn)
## Loading required package: proto
library(mice)
## Loading required package: Rcpp
## mice 2.25 2015-11-09
##
## Attaching package: 'mice'
## The following object is masked from 'package:tidyr':
##
## complete
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:boot':
##
## logit
library(leaps)
library(lattice)
library(PerformanceAnalytics)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
## The following object is masked from 'package:data.table':
##
## last
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
library(d3heatmap)
library(leaflet)
options(scipen = 999)
setwd("C:\\Users\\Aritra\\Dropbox\\Resolver\\Data")
df<-tbl_df(data.frame(read.csv("Cars.csv",header = T, stringsAsFactors = F)))
dataclean<-function(x){
x<-gsub("Dollar", "", x);x<-gsub("Euro", "", x);x<-trimws(x);x[x=="."]=NA;x}
replace_dollar_euro<-function(x){
x<-gsub("[$]","Dollar ",x);x<-gsub("[€]","Euro ",x);x<-trimws(x);x}
#replacing dots with NA
df[df=="."]=NA
#function for flatten correlation and p value
flattenCorrMatrix <- function(cormat, pmat) {
ut <- upper.tri(cormat)
data.frame(
row = rownames(cormat)[row(cormat)[ut]],
column = rownames(cormat)[col(cormat)[ut]],
cor =(cormat)[ut],
p = pmat[ut])}
df1<-df %>%
mutate(MSRP=replace_dollar_euro(MSRP),
Cost.Price=replace_dollar_euro(Cost.Price),
Invoice=replace_dollar_euro(Invoice)) %>%
separate(Cost.Price, c("Currency", "Cost.Price1", "z"), " ", fill = "right",remove = F) %>%
select(-Cost.Price1,-z) %>%
mutate(MSRP=gsub(",", "", dataclean(MSRP)),
Cost.Price=gsub(",", "",dataclean(Cost.Price)),
Invoice=gsub(",", "",dataclean(Invoice))) %>%
mutate(MSRP=as.numeric(MSRP),
Cost.Price=as.numeric(Cost.Price),
Invoice=as.numeric(Invoice),
EngineSize=as.numeric(EngineSize),
Make=as.factor(Make),
Model=as.factor(Model),
Type=as.factor(Type),
Origin=as.factor(Origin),
DriveTrain=as.factor(DriveTrain),
Currency=as.factor(Currency),
MPG_Highway=as.numeric(MPG_Highway))
df<-na.omit(df1)
datatable(md.pattern(df1))
aggr_plot <- aggr(df1, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(data), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))

##
## Variables sorted by number of missings:
## Variable Count
## MPG_Highway 0.156542056
## EngineSize 0.058411215
## MSRP 0.037383178
## Cylinders 0.004672897
## Make 0.000000000
## Model 0.000000000
## Type 0.000000000
## Origin 0.000000000
## DriveTrain 0.000000000
## Cost.Price 0.000000000
## Currency 0.000000000
## Invoice 0.000000000
## Horsepower 0.000000000
## MPG_City 0.000000000
## Weight 0.000000000
## Wheelbase 0.000000000
## Length 0.000000000
## Time.in.Inventory.in.days. 0.000000000
# panel.cor = function(x, y, digits=2, prefix="", cex.cor, ...){
# usr = par("usr")
# on.exit(par(usr))
# par(usr = c(0,1,0,1))
# r = abs(cor(x,y,use='complete.obs'))
# txt=format(c(r,0.123456789),digits=digits)[1]
# txt=paste(prefix,txt,sep='')
# if(missing(cex.cor)) cex.cor = 0.8/strwidth(txt)
# text(0.5, 0.5, txt, cex = cex.cor*(1+r)/2)
# }
# panel.hist = function(x, ...){
# usr = par('usr')
# on.exit(par(usr))
# par(usr = c(usr[1:2], 0, 1.5))
# h = hist(x, plot=FALSE)
# breaks = h$breaks
# nB = length(breaks)
# y=h$counts
# y=y/max(y)
# rect(breaks[-nB], 0, breaks[-1], y, col='white',...)
# }
# panel.lm = function(x, y, col=par('col'), bg = NA, pch=par('pch'),
# cex=1, col.smooth='black', ...){
# points(x,y,pch=pch, col=col, bg=bg, cex=cex)
# abline(stats::lm(y~x), col=col.smooth,...)
# }
# pairs(df[,-c(1:5,8,10,18)], upper.panel = panel.cor,
# diag.panel = panel.hist,
# lower.panel = panel.smooth)
# pairs(df[,-c(1:5,8,10,18)], pch=".", upper.panel = panel.cor,
# diag.panel = panel.hist, lower.panel = panel.lm,main="T")
# ggcorr(df[,-c(1:5,8,10,18)], palette = "RdBu",label = TRUE)
datatable(df, rownames = FALSE)
chart.Correlation(df[,-c(1:5,8,18)], histogram=TRUE, pch=18)

pairsD3(df[,-c(1:5,8,18)],group=unlist(df[,3]),big=T)
## Input to asJSON(keep_vec_names=TRUE) is a named vector. In a future version of jsonlite, this option will not be supported, and named vectors will be translated into arrays instead of objects. If you want JSON object output, please use a named list instead. See ?toJSON.
pairsD3(df[,-c(1:5,8,18)],group=unlist(df[,4]),big=T)
## Input to asJSON(keep_vec_names=TRUE) is a named vector. In a future version of jsonlite, this option will not be supported, and named vectors will be translated into arrays instead of objects. If you want JSON object output, please use a named list instead. See ?toJSON.
pairsD3(df[,-c(1:5,8,18)],group=unlist(df[,5]),big=T)
## Input to asJSON(keep_vec_names=TRUE) is a named vector. In a future version of jsonlite, this option will not be supported, and named vectors will be translated into arrays instead of objects. If you want JSON object output, please use a named list instead. See ?toJSON.
d3heatmap(df[,-c(1:5,8,18)], scale = "column", colors = "Spectral")
df_cor<-cor(df[,-c(1:5,8,18)])
df_cor
## MSRP Cost.Price Invoice EngineSize Cylinders
## MSRP 1.0000000 1.0000000 0.9986051 0.5195479 0.4762203
## Cost.Price 1.0000000 1.0000000 0.9986051 0.5195487 0.4762213
## Invoice 0.9986051 0.9986051 1.0000000 0.5231932 0.4786950
## EngineSize 0.5195479 0.5195487 0.5231932 1.0000000 0.9072983
## Cylinders 0.4762203 0.4762213 0.4786950 0.9072983 1.0000000
## Horsepower 0.4363409 0.4363420 0.4406733 0.7857990 0.8058041
## MPG_City -0.4866326 -0.4866331 -0.4908774 -0.8068176 -0.7659206
## MPG_Highway -0.4334796 -0.4334799 -0.4371530 -0.7615449 -0.7114240
## Weight 0.4267140 0.4267152 0.4266538 0.7963059 0.7365334
## Wheelbase 0.3092619 0.3092630 0.3066052 0.6400867 0.5485896
## Length 0.2690598 0.2690606 0.2685344 0.6377372 0.5488357
## Horsepower MPG_City MPG_Highway Weight Wheelbase
## MSRP 0.4363409 -0.4866326 -0.4334796 0.4267140 0.3092619
## Cost.Price 0.4363420 -0.4866331 -0.4334799 0.4267152 0.3092630
## Invoice 0.4406733 -0.4908774 -0.4371530 0.4266538 0.3066052
## EngineSize 0.7857990 -0.8068176 -0.7615449 0.7963059 0.6400867
## Cylinders 0.8058041 -0.7659206 -0.7114240 0.7365334 0.5485896
## Horsepower 1.0000000 -0.7427026 -0.6702211 0.6319579 0.3805088
## MPG_City -0.7427026 1.0000000 0.9414652 -0.8390258 -0.5820273
## MPG_Highway -0.6702211 0.9414652 1.0000000 -0.8309596 -0.5506023
## Weight 0.6319579 -0.8390258 -0.8309596 1.0000000 0.7777583
## Wheelbase 0.3805088 -0.5820273 -0.5506023 0.7777583 1.0000000
## Length 0.3720035 -0.5445035 -0.4619017 0.6992823 0.8913818
## Length
## MSRP 0.2690598
## Cost.Price 0.2690606
## Invoice 0.2685344
## EngineSize 0.6377372
## Cylinders 0.5488357
## Horsepower 0.3720035
## MPG_City -0.5445035
## MPG_Highway -0.4619017
## Weight 0.6992823
## Wheelbase 0.8913818
## Length 1.0000000
d3heatmap(df_cor, scale = "column", colors = "Blues")
df_cor_pval<-rcorr(as.matrix(df[,-c(1:5,8,10,18)]))
df_cor_pval$P
## MSRP Cost.Price
## MSRP NA 0.0000000000000000000000
## Cost.Price 0.0000000000000000000000 NA
## Invoice 0.0000000000000000000000 0.0000000000000000000000
## Cylinders 0.0000000000000000000000 0.0000000000000000000000
## Horsepower 0.0000000000000000000000 0.0000000000000000000000
## MPG_City 0.0000000000000000000000 0.0000000000000000000000
## MPG_Highway 0.0000000000000002220446 0.0000000000000002220446
## Weight 0.0000000000000008881784 0.0000000000000008881784
## Wheelbase 0.0000000124091847908403 0.0000000124076737773038
## Length 0.0000008525662378300325 0.0000008524996009118269
## Invoice Cylinders Horsepower
## MSRP 0.0000000000000000000000 0 0.000000000000000000
## Cost.Price 0.0000000000000000000000 0 0.000000000000000000
## Invoice NA 0 0.000000000000000000
## Cylinders 0.0000000000000000000000 NA 0.000000000000000000
## Horsepower 0.0000000000000000000000 0 NA
## MPG_City 0.0000000000000000000000 0 0.000000000000000000
## MPG_Highway 0.0000000000000000000000 0 0.000000000000000000
## Weight 0.0000000000000008881784 0 0.000000000000000000
## Wheelbase 0.0000000167518057203608 0 0.000000000001227685
## Length 0.0000008971578357996890 0 0.000000000004182654
## MPG_City MPG_Highway Weight
## MSRP 0 0.0000000000000002220446 0.0000000000000008881784
## Cost.Price 0 0.0000000000000002220446 0.0000000000000008881784
## Invoice 0 0.0000000000000000000000 0.0000000000000008881784
## Cylinders 0 0.0000000000000000000000 0.0000000000000000000000
## Horsepower 0 0.0000000000000000000000 0.0000000000000000000000
## MPG_City NA 0.0000000000000000000000 0.0000000000000000000000
## MPG_Highway 0 NA 0.0000000000000000000000
## Weight 0 0.0000000000000000000000 NA
## Wheelbase 0 0.0000000000000000000000 0.0000000000000000000000
## Length 0 0.0000000000000000000000 0.0000000000000000000000
## Wheelbase Length
## MSRP 0.000000012409184791 0.000000852566237830
## Cost.Price 0.000000012407673777 0.000000852499600912
## Invoice 0.000000016751805720 0.000000897157835800
## Cylinders 0.000000000000000000 0.000000000000000000
## Horsepower 0.000000000001227685 0.000000000004182654
## MPG_City 0.000000000000000000 0.000000000000000000
## MPG_Highway 0.000000000000000000 0.000000000000000000
## Weight 0.000000000000000000 0.000000000000000000
## Wheelbase NA 0.000000000000000000
## Length 0.000000000000000000 NA
write.csv(df,"corr_df.csv")
df_final_corr<-tbl_df(flattenCorrMatrix(df_cor_pval$r, df_cor_pval$P))
#filter significant correlation
df_final_corr %>%
filter(p<.05)
## # A tibble: 45 × 4
## row column cor p
## <fctr> <fctr> <dbl> <dbl>
## 1 MSRP Cost.Price 1.0000000 0
## 2 MSRP Invoice 0.9986051 0
## 3 Cost.Price Invoice 0.9986051 0
## 4 MSRP Cylinders 0.4762203 0
## 5 Cost.Price Cylinders 0.4762213 0
## 6 Invoice Cylinders 0.4786950 0
## 7 MSRP Horsepower 0.4363409 0
## 8 Cost.Price Horsepower 0.4363420 0
## 9 Invoice Horsepower 0.4406734 0
## 10 Cylinders Horsepower 0.8058041 0
## # ... with 35 more rows
rm(df,df1,df_final_corr)
df<-tbl_df(read.csv("Cars.csv",header = T, stringsAsFactors = F))
# Function to clean the data.
dataclean<-function(x){
x<-gsub("Dollar", "", x);x<-gsub("Euro", "", x);x<-trimws(x);x[x=="."]=NA;x}
# Function to replace dollar and euro symbols with text
replace_dollar_euro<-function(x){
x<-gsub("[$]","Dollar ",x);x<-gsub("[€]","Euro ",x);x<-trimws(x);x}
# function for feature selection in msrp
featureselect<-function(df){
Boruta(MSRP~.- MtoC_rate -ItoM_rate -MtoI_rate-Cost.Price_new -MSRP_new -Invoice_new -Model -MSRP -Cost.Price -Invoice -Invoice_WP -MSRP_WP -Time.in.Inventory.in.days. -Currency, data = df, doTrace = 2)}
# function to plot important variables (Feature selection)
plot_ImpVar<-function(df){
plot(df, xlab = "", xaxt = "n")
lz<-lapply(1:ncol(df$ImpHistory),function(i)
df$ImpHistory[is.finite(df$ImpHistory[,i]),i])
names(lz) <- colnames(df$ImpHistory)
Labels <- sort(sapply(lz,median))
axis(side = 1,las=2,labels = names(Labels),
at = 1:ncol(df$ImpHistory), cex.axis = 0.7)}
#Reading data to a tibble
df<-tbl_df(read.csv("Cars.csv",header = T, stringsAsFactors = F))
#replacing dotswith NA
df[df=="."]=NA
#Imputation
df<-df %>%
mutate(MSRP=replace_dollar_euro(MSRP),
Cost.Price=replace_dollar_euro(Cost.Price),
Invoice=replace_dollar_euro(Invoice)) %>%
separate(Cost.Price, c("Currency", "Cost.Price1", "z"), " ", fill = "right",remove = F) %>%
select(-Cost.Price1,-z) %>%
mutate(MSRP=gsub(",", "", dataclean(MSRP)),
Cost.Price=gsub(",", "",dataclean(Cost.Price)),
Invoice=gsub(",", "",dataclean(Invoice))) %>%
mutate(MSRP=as.numeric(MSRP),
Cost.Price=as.numeric(Cost.Price),
Invoice=as.numeric(Invoice),
EngineSize=as.factor(EngineSize),
Make=as.factor(Make),
Model=as.factor(Model),
Type=as.factor(Type),
Origin=as.factor(Origin),
DriveTrain=as.factor(DriveTrain),
Currency=as.factor(Currency),
MPG_Highway=as.numeric(MPG_Highway)) %>%
impute_lm(MPG_Highway~MPG_City) %>%
impute_lm(Cylinders~Horsepower) %>%
impute_rf(EngineSize~Cylinders) %>%
impute_lm(MSRP~Cost.Price)
#Euro to Dollar conversion
df1<-df %>%
filter(Currency=="Euro") %>%
mutate(MSRP=1.1*MSRP,
Cost.Price=1.1*Cost.Price,
Invoice=1.1*Invoice)
df2<-df %>%
filter(Currency!="Euro")
df<-rbind(df1,df2)
rm(df2,df1)
# From https://www.census.gov/geo/maps-data/data/cbf/cbf_state.html
continent <- readOGR("continent.shp",
layer = "continent", verbose = FALSE)
CONTINENT <- subset(continent, continent$CONTINENT %in% c(
"Asia","Europe","North America","South America"))
map <- leaflet(CONTINENT)
# p<-data.frame(value=c(0.3691589,0.2873832,0.3434579,0.3434579),names=c("Asia","Europe","North America","South America"))
# p
#
# qpal <- colorQuantile("Blues", p$value, n =4)
map %>%
addPolygons(
stroke = FALSE, fillOpacity = 0.5, smoothFactor = 0.5)