DATA IMPORT & DATA WRANGLING

Install the packages for data import/export, wrangling and visualisation

#install.packages('tidyverse') #comes with dplyr, readr, ggplot2, tidyr etc
#install.packages('readxl') #reads excel files
#install.packages('writexl') # writes excel files
#install.packages('lubridate') # to help parse and manipulate dates

Loading required packages and the data

#load the packages
library (tidyverse)
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library (readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library (writexl)
## Warning: package 'writexl' was built under R version 4.4.3
library (lubridate)

#load the data
data(mtcars)
attach(mtcars)
## The following object is masked from package:ggplot2:
## 
##     mpg
View(mtcars)

#Take a quick look of the data
glimpse(mtcars)
## Rows: 32
## Columns: 11
## $ mpg  <dbl> 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.2, 17.8,…
## $ cyl  <dbl> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 8,…
## $ disp <dbl> 160.0, 160.0, 108.0, 258.0, 360.0, 225.0, 360.0, 146.7, 140.8, 16…
## $ hp   <dbl> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, 180, 180…
## $ drat <dbl> 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92,…
## $ wt   <dbl> 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, 3.150, 3.…
## $ qsec <dbl> 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, 22.90, 18…
## $ vs   <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,…
## $ am   <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,…
## $ gear <dbl> 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3,…
## $ carb <dbl> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2,…
head(mtcars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
tail(mtcars)
##                 mpg cyl  disp  hp drat    wt qsec vs am gear carb
## Porsche 914-2  26.0   4 120.3  91 4.43 2.140 16.7  0  1    5    2
## Lotus Europa   30.4   4  95.1 113 3.77 1.513 16.9  1  1    5    2
## Ford Pantera L 15.8   8 351.0 264 4.22 3.170 14.5  0  1    5    4
## Ferrari Dino   19.7   6 145.0 175 3.62 2.770 15.5  0  1    5    6
## Maserati Bora  15.0   8 301.0 335 3.54 3.570 14.6  0  1    5    8
## Volvo 142E     21.4   4 121.0 109 4.11 2.780 18.6  1  1    4    2
dim(mtcars) #the data has 32 rows and 11 columns
## [1] 32 11
names(mtcars)
##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
## [11] "carb"

Data cleaning

cars_clean<-mtcars%>%
  rownames_to_column(var="Model") %>%  #make car names a column
  mutate(
    cyl=as.factor(cyl), #convert to a categorical variable
    am=factor(am,labels=c("Automatic",'Manual')), 
    vs=factor(vs,labels=c("V-shaped","Straight")),
    wt_kg=453.6*wt      #convert weight from lbs to kgs
  )

head(cars_clean)
##               Model  mpg cyl disp  hp drat    wt  qsec       vs        am gear
## 1         Mazda RX4 21.0   6  160 110 3.90 2.620 16.46 V-shaped    Manual    4
## 2     Mazda RX4 Wag 21.0   6  160 110 3.90 2.875 17.02 V-shaped    Manual    4
## 3        Datsun 710 22.8   4  108  93 3.85 2.320 18.61 Straight    Manual    4
## 4    Hornet 4 Drive 21.4   6  258 110 3.08 3.215 19.44 Straight Automatic    3
## 5 Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02 V-shaped Automatic    3
## 6           Valiant 18.1   6  225 105 2.76 3.460 20.22 Straight Automatic    3
##   carb    wt_kg
## 1    4 1188.432
## 2    4 1304.100
## 3    1 1052.352
## 4    1 1458.324
## 5    2 1560.384
## 6    1 1569.456

SUMMARY STATISTICS

summary(cars_clean)
##     Model                mpg        cyl         disp             hp       
##  Length:32          Min.   :10.40   4:11   Min.   : 71.1   Min.   : 52.0  
##  Class :character   1st Qu.:15.43   6: 7   1st Qu.:120.8   1st Qu.: 96.5  
##  Mode  :character   Median :19.20   8:14   Median :196.3   Median :123.0  
##                     Mean   :20.09          Mean   :230.7   Mean   :146.7  
##                     3rd Qu.:22.80          3rd Qu.:326.0   3rd Qu.:180.0  
##                     Max.   :33.90          Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec              vs             am    
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   V-shaped:18   Automatic:19  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   Straight:14   Manual   :13  
##  Median :3.695   Median :3.325   Median :17.71                               
##  Mean   :3.597   Mean   :3.217   Mean   :17.85                               
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90                               
##  Max.   :4.930   Max.   :5.424   Max.   :22.90                               
##       gear            carb           wt_kg       
##  Min.   :3.000   Min.   :1.000   Min.   : 686.3  
##  1st Qu.:3.000   1st Qu.:2.000   1st Qu.:1170.9  
##  Median :4.000   Median :2.000   Median :1508.2  
##  Mean   :3.688   Mean   :2.812   Mean   :1459.3  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:1637.5  
##  Max.   :5.000   Max.   :8.000   Max.   :2460.3
cars_clean %>%
  summarise(
    avg_mpg=mean(mpg),
    avg_hp=mean(hp),
    avg_wt=mean(wt_kg),
  )  #run summaries for mpg, hp and wt in kg
##    avg_mpg   avg_hp   avg_wt
## 1 20.09062 146.6875 1459.345
cars_clean%>%
  group_by(am)%>%
  summarise(
    mean_mpg=mean(mpg),
    mean_hp=mean(hp),
    n=n()
  ) #run summaries and group by transmission
## # A tibble: 2 × 4
##   am        mean_mpg mean_hp     n
##   <fct>        <dbl>   <dbl> <int>
## 1 Automatic     17.1    160.    19
## 2 Manual        24.4    127.    13
cars_clean %>%
  group_by(vs)%>%
  summarise(
    mean_hp1=mean(hp),
    mean_wt=mean(wt_kg),
    mean_mpg=mean(mpg),
    n=n()
  ) #summarise and group by Engine type
## # A tibble: 2 × 5
##   vs       mean_hp1 mean_wt mean_mpg     n
##   <fct>       <dbl>   <dbl>    <dbl> <int>
## 1 V-shaped    190.    1673.     16.6    18
## 2 Straight     91.4   1184.     24.6    14

VISUALIZATION

#barchart
cars_clean %>%
  group_by(cyl)%>%
  summarise(mean_mpg=mean(mpg)) %>%
  ggplot(aes(x=cyl,y=mean_mpg,fill=cyl))+
  geom_col()+
    labs(title="Average fuel Efficiency by Cylinders",y="Mean MPG",x="Number of Cylinders")+ theme_minimal()

#box plot

ggplot(cars_clean, aes(x=am, y=mpg, fill=am)) + geom_boxplot() +
  labs(title="Fuel Efficiency by Transmission type", x= "Transmission type", y="Miles per Galon") + 
  theme_minimal()

#scatter plot

ggplot(cars_clean, aes(x=hp, y=mpg, color=am, size =wt_kg)) +
  geom_point(alpha=0.7) + 
  labs(title= "Horsepower vs. Fuel Efficiency", 
       x="Horsepower (hp)", y= "Miles per gallon (mpg)",color="Transmission", size="Weight (kg)") +
         theme_minimal()

#trend plot

ggplot(cars_clean, aes(x = wt_kg, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(title = "Fuel Efficiency vs Weight",
       x = "Weight (kg)", y = "MPG") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'