R Markdown

This is an analysis of vehicle data collected in the USA; it is intended for class exercise purposes only.

#importing the libraries
library(psych)
library(DAAG)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
library(mice)
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
# Import the dataset.
# NOTE(review): the path is absolute and machine-specific, so this only runs
# where the file exists at exactly this location.
mydta <- read.csv("C:/Users/Abdul Qudoos/Desktop/vehicleMiss.csv", header = TRUE)
# Inspect the structure: column names, types, and first few values.
str(mydta)
## 'data.frame':    1624 obs. of  7 variables:
##  $ vehicle: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ fm     : int  0 10 15 0 13 21 11 5 8 1 ...
##  $ Mileage: int  863 4644 16330 13 22537 40931 34762 11051 7003 11 ...
##  $ lh     : num  1.1 2.4 4.2 1 4.5 3.1 0.7 2.9 3.4 0.7 ...
##  $ lc     : num  66.3 233 325.1 66.6 328.7 ...
##  $ mc     : num  697 120 175 0 175 ...
##  $ State  : chr  "MS" "CA" "WI" "OR" ...
# Per-column summary statistics, including NA counts for the numeric columns.
summary(mydta)
##     vehicle             fm            Mileage            lh        
##  Min.   :   1.0   Min.   :-1.000   Min.   :    1   Min.   : 0.000  
##  1st Qu.: 406.8   1st Qu.: 4.000   1st Qu.: 5778   1st Qu.: 1.500  
##  Median : 812.5   Median :10.000   Median :17000   Median : 2.600  
##  Mean   : 812.5   Mean   : 9.414   Mean   :20559   Mean   : 3.294  
##  3rd Qu.:1218.2   3rd Qu.:14.000   3rd Qu.:30061   3rd Qu.: 4.300  
##  Max.   :1624.0   Max.   :23.000   Max.   :99983   Max.   :35.200  
##                                    NA's   :13      NA's   :6       
##        lc               mc            State          
##  Min.   :   0.0   Min.   :   0.0   Length:1624       
##  1st Qu.: 106.5   1st Qu.: 119.7   Class :character  
##  Median : 195.4   Median : 119.7   Mode  :character  
##  Mean   : 242.8   Mean   : 179.4                     
##  3rd Qu.: 317.8   3rd Qu.: 175.5                     
##  Max.   :3234.4   Max.   :3891.1                     
##  NA's   :8
sum(is.na(mydta)) # total number of missing values across all columns
## [1] 42
md.pattern(mydta) # tabulate (and plot) the missing-data patterns; in the plot, red cells mark missing values

##      vehicle fm mc lh lc Mileage State   
## 1586       1  1  1  1  1       1     1  0
## 11         1  1  1  1  1       1     0  1
## 13         1  1  1  1  1       0     1  1
## 6          1  1  1  1  0       1     1  1
## 2          1  1  1  1  0       1     0  2
## 4          1  1  1  0  1       1     1  1
## 2          1  1  1  0  1       1     0  2
##            0  0  0  6  8      13    15 42

Imputation: now that we have seen the missing values, it is time to impute them using the mice library. Because State is a string (character) variable, we first convert it to a factor.

# Convert character columns (here: State) to factors so they are treated as
# categorical variables; the numeric columns are left unchanged.
dta <- as.data.frame(unclass(mydta), stringsAsFactors = TRUE)
str(dta)
## 'data.frame':    1624 obs. of  7 variables:
##  $ vehicle: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ fm     : int  0 10 15 0 13 21 11 5 8 1 ...
##  $ Mileage: int  863 4644 16330 13 22537 40931 34762 11051 7003 11 ...
##  $ lh     : num  1.1 2.4 4.2 1 4.5 3.1 0.7 2.9 3.4 0.7 ...
##  $ lc     : num  66.3 233 325.1 66.6 328.7 ...
##  $ mc     : num  697 120 175 0 175 ...
##  $ State  : Factor w/ 50 levels "AK","AL","AR",..: 25 5 48 37 4 9 18 10 47 38 ...
# Summary after the conversion: State is now tabulated by factor level.
summary(dta)
##     vehicle             fm            Mileage            lh        
##  Min.   :   1.0   Min.   :-1.000   Min.   :    1   Min.   : 0.000  
##  1st Qu.: 406.8   1st Qu.: 4.000   1st Qu.: 5778   1st Qu.: 1.500  
##  Median : 812.5   Median :10.000   Median :17000   Median : 2.600  
##  Mean   : 812.5   Mean   : 9.414   Mean   :20559   Mean   : 3.294  
##  3rd Qu.:1218.2   3rd Qu.:14.000   3rd Qu.:30061   3rd Qu.: 4.300  
##  Max.   :1624.0   Max.   :23.000   Max.   :99983   Max.   :35.200  
##                                    NA's   :13      NA's   :6       
##        lc               mc             State    
##  Min.   :   0.0   Min.   :   0.0   TX     :290  
##  1st Qu.: 106.5   1st Qu.: 119.7   CA     :199  
##  Median : 195.4   Median : 119.7   FL     :167  
##  Mean   : 242.8   Mean   : 179.4   GA     : 75  
##  3rd Qu.: 317.8   3rd Qu.: 175.5   AZ     : 61  
##  Max.   :3234.4   Max.   :3891.1   (Other):817  
##  NA's   :8                         NA's   : 15
# Impute columns 2-7 (everything except `vehicle`, presumably a row ID - confirm)
# with mice: 3 imputed datasets, fixed seed so the result is reproducible.
impute <- mice(data = dta[, 2:7], m = 3, seed = 234)
## 
##  iter imp variable
##   1   1  Mileage  lh  lc  State
##   1   2  Mileage  lh  lc  State
##   1   3  Mileage  lh  lc  State
##   2   1  Mileage  lh  lc  State
##   2   2  Mileage  lh  lc  State
##   2   3  Mileage  lh  lc  State
##   3   1  Mileage  lh  lc  State
##   3   2  Mileage  lh  lc  State
##   3   3  Mileage  lh  lc  State
##   4   1  Mileage  lh  lc  State
##   4   2  Mileage  lh  lc  State
##   4   3  Mileage  lh  lc  State
##   5   1  Mileage  lh  lc  State
##   5   2  Mileage  lh  lc  State
##   5   3  Mileage  lh  lc  State
# Show the imputation method used per column and the predictor matrix.
print(impute)
## Class: mids
## Number of multiple imputations:  3 
## Imputation methods:
##        fm   Mileage        lh        lc        mc     State 
##        ""     "pmm"     "pmm"     "pmm"        "" "polyreg" 
## PredictorMatrix:
##         fm Mileage lh lc mc State
## fm       0       1  1  1  1     1
## Mileage  1       0  1  1  1     1
## lh       1       1  0  1  1     1
## lc       1       1  1  0  1     1
## mc       1       1  1  1  0     1
## State    1       1  1  1  1     0
# To help decide which imputation to use, look at three rows where Mileage is missing.
# NOTE(review): only the original rows are printed here; the candidate imputed
# values (e.g. impute$imp$Mileage) are not shown, so the comparison is not visible.
mydta[20,]
##    vehicle fm Mileage  lh    lc   mc State
## 20      20  8      NA 1.4 87.42 1.85    NH
mydta[253,]
##     vehicle fm Mileage  lh    lc     mc State
## 253     253  1      NA 1.4 89.89 119.66    FL
mydta[1570,]
##      vehicle fm Mileage  lh    lc mc State
## 1570    1570  0      NA 0.7 44.43  0    PA
# Mileage is an important variable here, and imputation 1 gives the most
# plausible values for it, so the completed dataset is built from imputation 1.
newdta <- complete(impute, action = 1)
summary(newdta)
##        fm            Mileage            lh               lc        
##  Min.   :-1.000   Min.   :    1   Min.   : 0.000   Min.   :   0.0  
##  1st Qu.: 4.000   1st Qu.: 5691   1st Qu.: 1.500   1st Qu.: 106.5  
##  Median :10.000   Median :16994   Median : 2.600   Median : 195.6  
##  Mean   : 9.414   Mean   :20502   Mean   : 3.309   Mean   : 242.8  
##  3rd Qu.:14.000   3rd Qu.:29997   3rd Qu.: 4.300   3rd Qu.: 317.8  
##  Max.   :23.000   Max.   :99983   Max.   :35.200   Max.   :3234.4  
##                                                                    
##        mc             State    
##  Min.   :   0.0   TX     :291  
##  1st Qu.: 119.7   CA     :199  
##  Median : 119.7   FL     :167  
##  Mean   : 179.4   GA     : 75  
##  3rd Qu.: 175.5   AZ     : 64  
##  Max.   :3891.1   LA     : 49  
##                   (Other):779
# Confirm that no missing values remain after imputation.
sum(is.na(newdta))
## [1] 0

Visualizations

Now that the data is complete, we will create some visualizations.

# Scatter-plot matrix, histogram, and scatter plot of the numeric variables.
# The completed dataset (newdta) is used instead of the raw data so that rows
# with missing values are not dropped from the plots. Columns are selected by
# name because newdta has no `vehicle` column, which shifts the positions.
pairs.panels(newdta[, c("fm", "Mileage", "lh", "lc", "mc")], cex.cor = 0.5)

hist(newdta$Mileage, col = "green", breaks = 11,
     main = "Histogram for Mileage", xlab = "Mileage of the vehicle")

# Formula interface: labour hours (y) against labour cost (x).
plot(lh ~ lc, data = newdta, xlab = "Labour cost", ylab = "Labour Hours",
     main = "Scatter plot between labour cost and its the hours")

# Histograms of Mileage for the three states with the most vehicles, one panel
# per state. The completed dataset is used, so no rows are dropped for missing
# Mileage.
newdta %>%
  filter(State %in% c("CA", "FL", "TX")) %>%
  ggplot(aes(Mileage, fill = State)) +
  geom_histogram(color = "black") +
  ggtitle('Mileage for the vehicles') +
  facet_wrap(vars(State))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Scatter plots of Mileage against fm with a smoother, one panel per state.
# `size` is mapped only inside geom_point(): when it is set in ggplot() it is
# inherited by geom_smooth(), which causes the "size aesthetic for lines" and
# "aesthetics were dropped" warnings. The completed dataset is used, so no rows
# are removed for missing Mileage.
newdta %>%
  filter(State %in% c("CA", "FL", "TX")) %>%
  ggplot(aes(fm, Mileage, col = State)) +
  geom_point(aes(size = Mileage)) +
  geom_smooth(se = FALSE) +
  ggtitle('Fm (Months vs Mileages in the top States') +
  facet_wrap(vars(State))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Bar plot of total Mileage per state.
# The totals are computed explicitly: group_by() alone does not aggregate, so
# geom_col() was previously stacking one rectangle per vehicle. The completed
# dataset is used, so no rows are dropped for missing Mileage.
barplot_data <- newdta %>%
  group_by(State) %>%
  summarise(Mileage = sum(Mileage), .groups = "drop")

ggplot(barplot_data, aes(State, Mileage, fill = State)) +
  geom_col() +
  coord_flip() +
  ggtitle("BarPLot for Mileage for States")