General exploration

We store the data that we downloaded into an object called data.

# Import the survey file from the working directory (read.csv2 defaults:
# ";" as field separator, "," as decimal mark).
# stringsAsFactors = TRUE keeps `brand` a factor on R >= 4.0 as well, which the
# later summary()/str() calls on that column rely on.
data <- read.csv2("BelkinElagoComplete.csv", header = TRUE,
                  stringsAsFactors = TRUE)
head(data)
##     salary age elevel car zipcode credit brand
## 1 123476.6  23      4   1       0 779.56 Elago
## 2 120274.6  22      4   1       2 784.70 Elago
## 3 121735.5  26      4   1       1 749.35 Elago
## 4 138276.8  23      4   1       0 743.85 Elago
## 5 126869.2  23      4   1       0 759.03 Elago
## 6 130595.1  22      4   1       7 774.13 Elago
tail(data)
##         salary age elevel car zipcode credit brand
## 9995  93055.21  45      3   3       8 585.84 Elago
## 9996  82700.89  52      3   3       6 682.34 Elago
## 9997  87488.02  29      2  12       8 597.31 Elago
## 9998  90905.84  58      2  19       2 708.59 Elago
## 9999  91315.24  47      3  16       4 703.44 Elago
## 10000 80023.00  31      2   3       5 679.04 Elago
summary(data)
##      salary            age            elevel           car       
##  Min.   : 20000   Min.   :20.00   Min.   :1.000   Min.   : 1.00  
##  1st Qu.: 52109   1st Qu.:35.00   1st Qu.:2.000   1st Qu.: 5.00  
##  Median : 84969   Median :50.00   Median :2.000   Median :10.00  
##  Mean   : 84897   Mean   :49.81   Mean   :2.339   Mean   :10.47  
##  3rd Qu.:117168   3rd Qu.:65.00   3rd Qu.:3.000   3rd Qu.:16.00  
##  Max.   :150000   Max.   :80.00   Max.   :4.000   Max.   :20.00  
##     zipcode          credit         brand     
##  Min.   :0.000   Min.   :416.6   Belkin:4652  
##  1st Qu.:2.000   1st Qu.:563.4   Elago :5348  
##  Median :4.000   Median :632.1                
##  Mean   :4.037   Mean   :632.1                
##  3rd Qu.:6.000   3rd Qu.:701.3                
##  Max.   :8.000   Max.   :849.0
names(data)
## [1] "salary"  "age"     "elevel"  "car"     "zipcode" "credit"  "brand"
library(ggplot2)

Salary Exploration

summary(data$salary) # Five-number summary plus mean of the salary column
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20000   52109   84969   84897  117168  150000
class(data$salary)
## [1] "numeric"

Exploration of the age

summary(data$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20.00   35.00   50.00   49.81   65.00   80.00
class(data$age)                     # Stored as integer, i.e. numeric rather than categorical
## [1] "integer"
str(data$age)                        # Compact display of the age vector's type, length and first values
##  int [1:10000] 23 22 26 23 23 22 80 71 75 59 ...

Exploration of the brand

summary(data$brand)                   
## Belkin  Elago 
##   4652   5348
str(data$brand)
##  Factor w/ 2 levels "Belkin","Elago": 2 2 2 2 2 2 1 1 1 1 ...

Exploration of the credit

summary(data$credit)        # 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   416.6   563.4   632.1   632.1   701.3   849.0
str(data$credit)
##  num [1:10000] 780 785 749 744 759 ...

Exploring ggplot2

# breaks=c( 0,20000,40000, 60000, 80000, 100000,120000, 150000)
#labels=c("A", "B", "C", "D", "E", "F", "G", "H")
summary(data$salary)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20000   52109   84969   84897  117168  150000
# Bin salary into six right-closed bands labelled A-F: the first band is
# (0, 40000], the middle bands are 20000 wide, and the last is
# (120000, 150000].
data[["catSalary"]] <- cut(
  x = data[["salary"]],
  breaks = c(0, seq(from = 40000, to = 120000, by = 20000), 150000),
  labels = LETTERS[1:6]
)
summary(data$catSalary)
##    A    B    C    D    E    F 
## 1563 1505 1527 1574 1530 2301
# Convert the categorical columns to factors and keep a standalone copy of each.
# The zip code factor is written to a new column named `zipCode`; the original
# `zipcode` column is left untouched.
data$elevel <- as.factor(data$elevel)
Elevel_data <- data$elevel

data$zipCode <- as.factor(data$zipcode)
Zipcode_data <- data$zipCode

data$car <- as.factor(data$car)
Car_data <- data$car
# Scatter of education level against car, coloured by brand.
# Columns are referenced by bare name inside aes(): writing data$col there
# bypasses ggplot2's data masking, which breaks facetting and per-layer data
# and produces axis labels such as "data$car".
GG1 <- ggplot(data = data, aes(x = car, y = elevel, colour = brand))

GG2 <- GG1 + geom_point()

# facet_grid() is called without row/column variables, so a single panel is drawn.
GG3 <- GG2 + stat_smooth() + theme_bw() + facet_grid()

print(GG3)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

############ Histogram for Numerical Variables ###############

# Histograms of the numeric variables.
# Every plot object is stored together with its geom_histogram() layer: each one
# is drawn later via print(), and a bare ggplot() object without a layer would
# render as an empty panel.
# Columns are named directly inside aes() (no data$ prefix), and the bar fill is
# mapped with after_stat(count), the supported replacement for `..count..`.
Salary_his <- ggplot(data = data, aes(x = salary)) +
  geom_histogram(binwidth = 10000, aes(fill = after_stat(count))) +
  ggtitle("Salary Data frame Distribution")

Credit_his <- ggplot(data = data, aes(x = credit)) +
  geom_histogram(binwidth = 30, aes(fill = after_stat(count))) +
  ggtitle("Credit data frame distribution")

Age_his <- ggplot(data = data, aes(x = age)) +
  geom_histogram(binwidth = 10, colour = "blue", fill = "#ff0076") +
  ggtitle("Age data frame distribution")

library(ggplot2)
# Draw the three histograms in order: salary, age, credit.
invisible(lapply(list(Salary_his, Age_his, Credit_his), print))

BAR CHARTS FOR CATEGORICAL VARIABLES

# Bar charts counting the rows in each level of the categorical columns.
Elevel_bar <- ggplot(data, aes(elevel)) +
  geom_bar(fill = "pink", colour = "blue") +
  theme_bw()

Car_bar <- ggplot(data, aes(car)) +
  geom_bar(fill = "#ff0076")

# Uses the factor column `zipCode`, not the raw `zipcode` column.
Zipcode_bar <- ggplot(data, aes(zipCode)) +
  geom_bar(fill = "#ff0076")

Brand_bar <- ggplot(data, aes(brand)) +
  geom_bar(colour = "black", fill = "#ff0076")

# Draw the four bar charts in order: education level, car, zip code, brand.
invisible(lapply(list(Elevel_bar, Car_bar, Zipcode_bar, Brand_bar), print))

############## RELATIONSHIP BETWEEN VARIABLES USING GGPLOT & GEOM_BOXPLOT ################

# Box plots relating the variables to one another.

# Age distribution for each brand.
Brand_age <- ggplot(data = data, aes(x = brand, y = age)) +
  geom_boxplot(colour = "blue")

print(Brand_age)

# Salary by education level, with the boxes coloured by car.
Elevel_Sala_car <- ggplot(data = data, aes(x = elevel, y = salary, col = car)) +
  geom_boxplot()

print(Elevel_Sala_car + ggtitle("Relationship B/W Elevel, Salary and Car"))

# Salary against age, with the boxes coloured by education level.
Age_Sala_Age <- ggplot(data = data, aes(x = age, y = salary, col = elevel)) +
  geom_boxplot()

# The title names the variables this plot actually maps (age, salary, elevel).
print(Age_Sala_Age + ggtitle("Relationship B/W Age, Salary and Elevel"))

# Credit by education level.
Elevel_Credit <- ggplot(data = data, aes(x = elevel, y = credit)) +
  geom_boxplot()

# The title names the variables this plot actually maps (elevel, credit).
print(Elevel_Credit + ggtitle("Relationship B/W Elevel and Credit"))

library(funModeling)
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## funModeling v.1.9.3 :)
## Examples and tutorials at livebook.datascienceheroes.com
##  / Now in Spanish: librovivodecienciadedatos.ai
correlation_table(data = data, target = 'elevel')  # Salary (0.71) and credit (0.62) are positively correlated with education level
##   Variable elevel
## 1   elevel   1.00
## 2   salary   0.71
## 3   credit   0.62
## 4      age   0.01
## 5  zipcode   0.01
correlation_table(data = data, target = "brand")  # Only age (-0.35) shows a notable negative correlation with brand; salary and credit are close to zero (-0.02)
##   Variable brand
## 1    brand  1.00
## 2  zipcode  0.01
## 3   salary -0.02
## 4   credit -0.02
## 5      age -0.35
cross_plot(data, input = "elevel", target = "brand") # Plots how the input variable relates to the target variable

cross_plot(data, input = "age", target = "brand")
## Plotting transformed variable 'age' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'

library(funModeling)

CREATING TRAINING AND TEST SETS

# Fix the RNG seed so the random partition below is reproducible.
set.seed(125)

# 70% of the rows go to the training set; the remainder forms the test set.
trainSize <- round(0.7 * nrow(data))
testSize <- nrow(data) - trainSize
trainSize
## [1] 7000
testSize
## [1] 3000
# Despite its name, training_Model holds the sampled row indices, not a model.
training_Model <- sample.int(nrow(data), size = trainSize)
trainSet <- data[training_Model, ]
# Negative indexing keeps every row that was not sampled for training.
testSet <- data[-training_Model, ]

# Linear model of age as a function of brand, fitted on the training rows.
LM_brand <- lm(formula = age ~ brand, data = trainSet)
print(LM_brand)
## 
## Call:
## lm(formula = age ~ brand, data = trainSet)
## 
## Coefficients:
## (Intercept)   brandElago  
##       56.55       -12.39
# `brand` is a two-level factor (Belkin / Elago), so it is modelled with
# logistic regression. An ordinary lm() fit on a factor response is not
# meaningful and only produces warnings ("'-' not meaningful for factors").
# With family = binomial the first factor level (Belkin) is treated as failure,
# so the coefficients are log-odds of choosing Elago relative to elevel 1.
LM_Elevel <- glm(brand ~ elevel, data = trainSet, family = binomial)
print(LM_Elevel)
# NOTE(review): re-knit the document to regenerate the printed model output.