We store the data that we downloaded into an object called data.
# Import the survey file from the working directory. read.csv2() defaults to
# ";" as field separator and "," as decimal mark. TRUE is spelled out because
# the shorthand T is an ordinary variable that can be reassigned.
data <- read.csv2("BelkinElagoComplete.csv", header = TRUE)
# First and last six rows, as a quick sanity check of the import.
head(data)
## salary age elevel car zipcode credit brand
## 1 123476.6 23 4 1 0 779.56 Elago
## 2 120274.6 22 4 1 2 784.70 Elago
## 3 121735.5 26 4 1 1 749.35 Elago
## 4 138276.8 23 4 1 0 743.85 Elago
## 5 126869.2 23 4 1 0 759.03 Elago
## 6 130595.1 22 4 1 7 774.13 Elago
tail(data)
## salary age elevel car zipcode credit brand
## 9995 93055.21 45 3 3 8 585.84 Elago
## 9996 82700.89 52 3 3 6 682.34 Elago
## 9997 87488.02 29 2 12 8 597.31 Elago
## 9998 90905.84 58 2 19 2 708.59 Elago
## 9999 91315.24 47 3 16 4 703.44 Elago
## 10000 80023.00 31 2 3 5 679.04 Elago
# Per-column summary: quartiles for numeric columns, counts for the brand factor.
summary(data)
## salary age elevel car
## Min. : 20000 Min. :20.00 Min. :1.000 Min. : 1.00
## 1st Qu.: 52109 1st Qu.:35.00 1st Qu.:2.000 1st Qu.: 5.00
## Median : 84969 Median :50.00 Median :2.000 Median :10.00
## Mean : 84897 Mean :49.81 Mean :2.339 Mean :10.47
## 3rd Qu.:117168 3rd Qu.:65.00 3rd Qu.:3.000 3rd Qu.:16.00
## Max. :150000 Max. :80.00 Max. :4.000 Max. :20.00
## zipcode credit brand
## Min. :0.000 Min. :416.6 Belkin:4652
## 1st Qu.:2.000 1st Qu.:563.4 Elago :5348
## Median :4.000 Median :632.1
## Mean :4.037 Mean :632.1
## 3rd Qu.:6.000 3rd Qu.:701.3
## Max. :8.000 Max. :849.0
names(data)
## [1] "salary" "age" "elevel" "car" "zipcode" "credit" "brand"
library(ggplot2)
# Inspect individual columns: distribution summary, storage class and structure.
summary(data$salary) # Quartiles and mean of the salary column
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20000 52109 84969 84897 117168 150000
class(data$salary)
## [1] "numeric"
summary(data$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20.00 35.00 50.00 49.81 65.00 80.00
class(data$age) # age is stored as an integer column, not as categorical data
## [1] "integer"
str(data$age) # compact view: type, length and the first few values
## int [1:10000] 23 22 26 23 23 22 80 71 75 59 ...
summary(data$brand)
## Belkin Elago
## 4652 5348
str(data$brand)
## Factor w/ 2 levels "Belkin","Elago": 2 2 2 2 2 2 1 1 1 1 ...
summary(data$credit) # Quartiles and mean of the credit column
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 416.6 563.4 632.1 632.1 701.3 849.0
str(data$credit)
## num [1:10000] 780 785 749 744 759 ...
Exploring ggplot2
# breaks=c( 0,20000,40000, 60000, 80000, 100000,120000, 150000)
#labels=c("A", "B", "C", "D", "E", "F", "G", "H")
# Re-check the salary range before choosing the bin edges.
summary(data[["salary"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20000 52109 84969 84897 117168 150000
# Bin salary into six labelled bands, A (lowest) to F (highest). cut() builds
# right-closed intervals by default, so the 150000 maximum lands in band F.
data[["catSalary"]] <- cut(
  x = data[["salary"]],
  breaks = c(0, 40000, 60000, 80000, 100000, 120000, 150000),
  labels = c("A", "B", "C", "D", "E", "F")
)
# Row count per salary band.
summary(data[["catSalary"]])
## A B C D E F
## 1563 1505 1527 1574 1530 2301
# Treat the coded columns as categorical. Each factor is written into the data
# frame first and then copied into a standalone vector.
data$elevel <- as.factor(data$elevel)
Elevel_data <- data$elevel
# NOTE(review): this stores the factor in a new column `zipCode` (capital C)
# and leaves the original `zipcode` column numeric -- confirm this is intended.
data$zipCode <- as.factor(data$zipcode)
Zipcode_data <- data$zipCode
data$car <- as.factor(data$car)
Car_data <- data$car
# Scatter of car against education level, coloured by brand.
# Columns are referenced by bare name inside aes(): writing data$car there
# bypasses ggplot2's data masking, labels the axes and legend "data$car" etc.,
# and breaks faceting because the values are no longer tied to the plot data.
GG1 <- ggplot(data = data, aes(x = car, y = elevel, colour = brand))
GG2 <- GG1 + geom_point()
# NOTE(review): car and elevel have been converted to factors before this
# point, so the loess smooth presumably adds nothing -- consider dropping it.
GG3 <- GG2 + stat_smooth() + theme_bw() + facet_grid()
print(GG3)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
############ Histogram for Numerical Variables ###############
# Each histogram object holds the complete plot (data, geom layer and title).
# Salary_his previously stored only the bare ggplot() call, so the later
# print(Salary_his) drew an empty panel instead of the histogram.
# Columns are referenced by bare name in aes() so axis labels read "salary",
# "credit" and "age" rather than "data$salary" etc.
# NOTE(review): `..count..` is deprecated in ggplot2 >= 3.4.0 in favour of
# after_stat(count); switch once the installed ggplot2 version is confirmed.
Salary_his <- ggplot(data = data, aes(x = salary)) +
  geom_histogram(binwidth = 10000, aes(fill = ..count..)) +
  ggtitle("Salary Data frame Distribution")
Credit_his <- ggplot(data = data, aes(x = credit)) +
  geom_histogram(binwidth = 30, aes(fill = ..count..)) +
  ggtitle("Credit data frame distribution")
Age_his <- ggplot(data = data, aes(x = age)) +
  geom_histogram(binwidth = 10, colour = "blue", fill = "#ff0076") +
  ggtitle("Age data frame distribution")
library(ggplot2)
print(Salary_his)
print(Age_his)
print(Credit_his)
BAR CHARTS FOR CATEGORICAL VARIABLES
# One bar chart per categorical column; bar height is the row count per level.
Elevel_bar <- ggplot(data, aes(elevel)) +
  geom_bar(colour = "blue", fill = "pink") +
  theme_bw()
Car_bar <- ggplot(data, aes(car)) +
  geom_bar(fill = "#ff0076")
# Uses the factor column `zipCode` (capital C), not the numeric `zipcode`.
Zipcode_bar <- ggplot(data, aes(zipCode)) +
  geom_bar(fill = "#ff0076")
Brand_bar <- ggplot(data, aes(brand)) +
  geom_bar(fill = "#ff0076", colour = "black")
# Render the four charts in the same order as before.
invisible(lapply(list(Elevel_bar, Car_bar, Zipcode_bar, Brand_bar), print))
############## RELATIONSHIP B/W VARIABLES USING GGPLOT & GEOM_BOXPLOT ################
# Age distribution for each brand.
Brand_age <- ggplot(data = data, aes(x = brand, y = age)) +
  geom_boxplot(colour = "blue")
print(Brand_age)
# Salary per education level, with one box per car value.
Elevel_Sala_car <- ggplot(data = data, aes(x = elevel, y = salary, col = car)) +
  geom_boxplot()
print(Elevel_Sala_car + ggtitle("Relationship B/W Elevel, Salary and Car"))
# Salary against age, grouped by education level. The title now names the
# variables actually plotted; it was copied from the plot above.
# NOTE(review): age is a continuous column used on a boxplot x axis -- confirm
# this is the intended chart type.
Age_Sala_Age <- ggplot(data = data, aes(x = age, y = salary, col = elevel)) +
  geom_boxplot()
print(Age_Sala_Age + ggtitle("Relationship B/W Age, Salary and Elevel"))
# Credit per education level; the title is corrected in the same way.
Elevel_Credit <- ggplot(data = data, aes(x = elevel, y = credit)) +
  geom_boxplot()
print(Elevel_Credit + ggtitle("Relationship B/W Elevel and Credit"))
library(funModeling)
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
## funModeling v.1.9.3 :)
## Examples and tutorials at livebook.datascienceheroes.com
## / Now in Spanish: librovivodecienciadedatos.ai
# Correlation of each listed variable with the chosen target column.
correlation_table(data = data, target = 'elevel') # salary (0.71) and credit (0.62) are positively correlated with education level
## Variable elevel
## 1 elevel 1.00
## 2 salary 0.71
## 3 credit 0.62
## 4 age 0.01
## 5 zipcode 0.01
correlation_table(data = data, target = "brand") # Only age (-0.35) shows a notable negative correlation with brand; salary and credit are close to zero (-0.02)
## Variable brand
## 1 brand 1.00
## 2 zipcode 0.01
## 3 salary -0.02
## 4 credit -0.02
## 5 age -0.35
cross_plot(data, input = "elevel", target = "brand") # Plots how the input variable relates to the target variable
cross_plot(data, input = "age", target = "brand")
## Plotting transformed variable 'age' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'
library(funModeling)
CREATING TRAINING AND TEST SETS
# Fix the RNG state so the random split is reproducible.
set.seed(125)
# 70 % of the rows go to training, the remainder to testing.
trainSize <- round(0.7 * nrow(data))
testSize <- nrow(data) - trainSize
trainSize
## [1] 7000
testSize
## [1] 3000
# Draw the training row indices without replacement; every row not drawn
# becomes part of the test set.
training_Model <- sample(x = seq_len(nrow(data)), size = trainSize)
trainSet <- data[training_Model, , drop = FALSE]
testSet <- data[-training_Model, , drop = FALSE]
# Linear regression of age on brand, fitted on the training rows only.
# The brandElago coefficient is the difference in mean age relative to Belkin.
LM_brand <- lm(formula = age ~ brand, data = trainSet)
print(LM_brand)
##
## Call:
## lm(formula = age ~ brand, data = trainSet)
##
## Coefficients:
## (Intercept) brandElago
## 56.55 -12.39
# brand is a two-level factor ("Belkin", "Elago"), so it is modelled with
# logistic regression rather than ordinary least squares. Fitting lm() on a
# factor response is not meaningful: it raised the warnings "using type =
# "numeric" with a factor response will be ignored" and "'-' not meaningful
# for factors". With family = binomial the first level ("Belkin") is treated
# as failure and "Elago" as success, so the coefficients are log-odds of
# choosing Elago relative to education level 1.
LM_Elevel <- glm(brand ~ elevel, data = trainSet, family = binomial)
print(LM_Elevel)
# NOTE(review): the lm() warnings and coefficient printout previously recorded
# here no longer apply; re-run / re-knit to regenerate the output.