This is an analysis of vehicle data collected in the USA; it is intended for class exercise purposes only.
#importing the libraries
library(psych)
library(DAAG)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
# Import the dataset. The first row of the CSV holds the column names.
# NOTE(review): the path is absolute and machine-specific; adjust it (or use a
# project-relative path) when running this elsewhere.
mydta <- read.csv("C:/Users/Abdul Qudoos/Desktop/vehicleMiss.csv", header = TRUE)
# Inspect the structure of the raw data: dimensions, column types, first values.
utils::str(object = mydta)
## 'data.frame': 1624 obs. of 7 variables:
## $ vehicle: int 1 2 3 4 5 6 7 8 9 10 ...
## $ fm : int 0 10 15 0 13 21 11 5 8 1 ...
## $ Mileage: int 863 4644 16330 13 22537 40931 34762 11051 7003 11 ...
## $ lh : num 1.1 2.4 4.2 1 4.5 3.1 0.7 2.9 3.4 0.7 ...
## $ lc : num 66.3 233 325.1 66.6 328.7 ...
## $ mc : num 697 120 175 0 175 ...
## $ State : chr "MS" "CA" "WI" "OR" ...
# Per-column summary statistics of the raw data, including NA counts.
base::summary(object = mydta)
## vehicle fm Mileage lh
## Min. : 1.0 Min. :-1.000 Min. : 1 Min. : 0.000
## 1st Qu.: 406.8 1st Qu.: 4.000 1st Qu.: 5778 1st Qu.: 1.500
## Median : 812.5 Median :10.000 Median :17000 Median : 2.600
## Mean : 812.5 Mean : 9.414 Mean :20559 Mean : 3.294
## 3rd Qu.:1218.2 3rd Qu.:14.000 3rd Qu.:30061 3rd Qu.: 4.300
## Max. :1624.0 Max. :23.000 Max. :99983 Max. :35.200
## NA's :13 NA's :6
## lc mc State
## Min. : 0.0 Min. : 0.0 Length:1624
## 1st Qu.: 106.5 1st Qu.: 119.7 Class :character
## Median : 195.4 Median : 119.7 Mode :character
## Mean : 242.8 Mean : 179.4
## 3rd Qu.: 317.8 3rd Qu.: 175.5
## Max. :3234.4 Max. :3891.1
## NA's :8
# Total number of missing cells in the whole data frame.
length(which(is.na(mydta)))
## [1] 42
# Missing-data pattern: prints the pattern table and draws the pattern plot
# (red cells mark missing values).
md.pattern(x = mydta)
## vehicle fm mc lh lc Mileage State
## 1586 1 1 1 1 1 1 1 0
## 11 1 1 1 1 1 1 0 1
## 13 1 1 1 1 1 0 1 1
## 6 1 1 1 1 0 1 1 1
## 2 1 1 1 1 0 1 0 2
## 4 1 1 1 0 1 1 1 1
## 2 1 1 1 0 1 1 0 2
## 0 0 0 6 8 13 15 42
Imputation: Now that we have seen the missing values, it is time to impute them, and for that we will use the mice library. The State column holds strings, so we first convert it to a factor variable.
# Rebuild the data frame so that character columns (here State) become factors,
# then confirm the new column types.
dta <- as.data.frame(unclass(mydta), stringsAsFactors = TRUE)
str(dta)
## 'data.frame': 1624 obs. of 7 variables:
## $ vehicle: int 1 2 3 4 5 6 7 8 9 10 ...
## $ fm : int 0 10 15 0 13 21 11 5 8 1 ...
## $ Mileage: int 863 4644 16330 13 22537 40931 34762 11051 7003 11 ...
## $ lh : num 1.1 2.4 4.2 1 4.5 3.1 0.7 2.9 3.4 0.7 ...
## $ lc : num 66.3 233 325.1 66.6 328.7 ...
## $ mc : num 697 120 175 0 175 ...
## $ State : Factor w/ 50 levels "AK","AL","AR",..: 25 5 48 37 4 9 18 10 47 38 ...
# Summary after the conversion; State is now tabulated by level.
base::summary(object = dta)
## vehicle fm Mileage lh
## Min. : 1.0 Min. :-1.000 Min. : 1 Min. : 0.000
## 1st Qu.: 406.8 1st Qu.: 4.000 1st Qu.: 5778 1st Qu.: 1.500
## Median : 812.5 Median :10.000 Median :17000 Median : 2.600
## Mean : 812.5 Mean : 9.414 Mean :20559 Mean : 3.294
## 3rd Qu.:1218.2 3rd Qu.:14.000 3rd Qu.:30061 3rd Qu.: 4.300
## Max. :1624.0 Max. :23.000 Max. :99983 Max. :35.200
## NA's :13 NA's :6
## lc mc State
## Min. : 0.0 Min. : 0.0 TX :290
## 1st Qu.: 106.5 1st Qu.: 119.7 CA :199
## Median : 195.4 Median : 119.7 FL :167
## Mean : 242.8 Mean : 179.4 GA : 75
## 3rd Qu.: 317.8 3rd Qu.: 175.5 AZ : 61
## Max. :3234.4 Max. :3891.1 (Other):817
## NA's :8 NA's : 15
# Multiple imputation on every column except the vehicle identifier.
# m = 3 produces three imputed data sets; the seed makes the run reproducible.
impute <- mice(
  dta[, names(dta) != "vehicle"],
  m = 3,
  seed = 234
)
##
## iter imp variable
## 1 1 Mileage lh lc State
## 1 2 Mileage lh lc State
## 1 3 Mileage lh lc State
## 2 1 Mileage lh lc State
## 2 2 Mileage lh lc State
## 2 3 Mileage lh lc State
## 3 1 Mileage lh lc State
## 3 2 Mileage lh lc State
## 3 3 Mileage lh lc State
## 4 1 Mileage lh lc State
## 4 2 Mileage lh lc State
## 4 3 Mileage lh lc State
## 5 1 Mileage lh lc State
## 5 2 Mileage lh lc State
## 5 3 Mileage lh lc State
# Show the imputation set-up: method chosen per column and the predictor matrix.
print(x = impute)
## Class: mids
## Number of multiple imputations: 3
## Imputation methods:
## fm Mileage lh lc mc State
## "" "pmm" "pmm" "pmm" "" "polyreg"
## PredictorMatrix:
## fm Mileage lh lc mc State
## fm 0 1 1 1 1 1
## Mileage 1 0 1 1 1 1
## lh 1 1 0 1 1 1
## lc 1 1 1 0 1 1
## mc 1 1 1 1 0 1
## State 1 1 1 1 1 0
# To decide which of the three imputations to keep, look at three rows whose
# Mileage is missing in the raw data.
mydta[20,]
## vehicle fm Mileage lh lc mc State
## 20 20 8 NA 1.4 87.42 1.85 NH
mydta[253,]
## vehicle fm Mileage lh lc mc State
## 253 253 1 NA 1.4 89.89 119.66 FL
mydta[1570,]
## vehicle fm Mileage lh lc mc State
## 1570 1570 0 NA 0.7 44.43 0 PA
# Candidate Mileage values drawn for those same rows, one column per
# imputation, so they can be judged against the other values in each row.
impute$imp$Mileage[c("20", "253", "1570"), ]
# Mileage is an important variable here, and imputation number 1 gives the
# most plausible values for it, so imputation 1 is used to fill the gaps.
# Build the completed data set from imputation 1 and summarise it.
newdta <- complete(impute, action = 1)
base::summary(object = newdta)
## fm Mileage lh lc
## Min. :-1.000 Min. : 1 Min. : 0.000 Min. : 0.0
## 1st Qu.: 4.000 1st Qu.: 5691 1st Qu.: 1.500 1st Qu.: 106.5
## Median :10.000 Median :16994 Median : 2.600 Median : 195.6
## Mean : 9.414 Mean :20502 Mean : 3.309 Mean : 242.8
## 3rd Qu.:14.000 3rd Qu.:29997 3rd Qu.: 4.300 3rd Qu.: 317.8
## Max. :23.000 Max. :99983 Max. :35.200 Max. :3234.4
##
## mc State
## Min. : 0.0 TX :291
## 1st Qu.: 119.7 CA :199
## Median : 119.7 FL :167
## Mean : 179.4 GA : 75
## 3rd Qu.: 175.5 AZ : 64
## Max. :3891.1 LA : 49
## (Other):779
# Count the missing cells that remain in the completed data set.
length(which(is.na(newdta)))
## [1] 0
Now that the data is complete, we will create some visualizations.
# Exploratory plots: scatter-plot matrix, histogram and a bivariate scatter.
# They are drawn from the completed data (newdta) so that rows with formerly
# missing values are included.

# Scatter-plot matrix with correlations for the five numeric columns.
pairs.panels(newdta[, c("fm", "Mileage", "lh", "lc", "mc")], cex.cor = 0.5)

# Distribution of mileage over all vehicles.
hist(newdta$Mileage,
  col = "green", breaks = 11,
  main = "Histogram for Mileage", xlab = "Mileage of the vehicle"
)

# Labour hours (y) against labour cost (x).
plot(lh ~ lc,
  data = newdta,
  xlab = "Labour cost", ylab = "Labour Hours",
  main = "Scatter plot between labour cost and its the hours"
)
# Histograms of mileage for the three states with the most vehicles, one panel
# per state. The completed data is used so no rows are dropped, and the bin
# count is stated explicitly instead of relying on the silent default of 30.
newdta %>%
  filter(State %in% c("CA", "FL", "TX")) %>%
  ggplot(aes(Mileage, fill = State)) +
  geom_histogram(color = "black", bins = 30) +
  ggtitle("Mileage for the vehicles") +
  facet_wrap(vars(State))
# Faceted scatter plots of fm against mileage for the three largest states,
# each with a loess trend line (no confidence band).
# The size aesthetic is mapped on the point layer only: mapping it globally
# also passed it to geom_smooth(), which cannot use it and raised warnings.
# The smoothing method and formula are spelled out to match the defaults that
# ggplot2 previously picked on its own.
newdta %>%
  filter(State %in% c("CA", "FL", "TX")) %>%
  ggplot(aes(fm, Mileage, col = State)) +
  geom_point(aes(size = Mileage)) +
  geom_smooth(method = "loess", formula = y ~ x, se = FALSE) +
  ggtitle("Fm (Months) vs Mileages in the top States") +
  facet_wrap(vars(State))
# Horizontal bar chart of total mileage per state.
# The totals are computed explicitly (one row per state) from the completed
# data instead of relying on geom_col() stacking one segment per vehicle, and
# the result is ungrouped so later code does not inherit the grouping.
barplot_data <- newdta %>%
  group_by(State) %>%
  summarise(Mileage = sum(Mileage), .groups = "drop")

ggplot(barplot_data, aes(State, Mileage, fill = State)) +
  geom_col() +
  coord_flip() +
  ggtitle("BarPLot for Mileage for States")