Introduction

This Report is for Individual Assignment 1

Data Summary

Setting up working directory

getwd()
## [1] "/Users/markyuhasz/FallWinter 2024 Classes/MKTG3P98-Business analytics and intell/Assignment1"
setwd("/Users/markyuhasz/FallWinter 2024 Classes/MKTG3P98-Business analytics and intell/Assignment1")

Install/Select Packages.

library(readxl)
library(ggplot2)
library(readr) #To read CSV files
library(stringr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Read the CSV files and get accurate structure of data frame stats.

Car1<-read.csv("Car_Survey_1a.csv")
str(Car1)
## 'data.frame':    1180 obs. of  23 variables:
##  $ Resp        : chr  "Res1" "Res2" "Res3" "Res4" ...
##  $ Att_1       : int  6 7 7 4 6 6 1 6 3 6 ...
##  $ Att_2       : int  6 5 7 1 6 6 1 5 2 6 ...
##  $ Enj_1       : int  6 5 7 1 6 6 1 5 3 4 ...
##  $ Enj_2       : int  6 2 5 1 5 5 1 3 2 4 ...
##  $ Perform_1   : int  5 2 5 1 5 5 2 5 2 4 ...
##  $ Perform_2   : int  6 6 5 1 2 5 2 5 3 4 ...
##  $ Perform_3   : int  3 7 3 1 1 7 2 2 1 1 ...
##  $ WOM_1       : int  3 5 6 7 7 5 2 4 6 5 ...
##  $ WOM_2       : int  3 5 6 7 7 5 3 6 6 6 ...
##  $ Futu_Pur_1  : int  3 6 7 3 7 7 5 4 7 6 ...
##  $ Futu_Pur_2  : int  3 6 7 3 6 7 2 4 7 6 ...
##  $ Valu_Percp_1: int  5 6 5 6 6 7 2 4 6 6 ...
##  $ Valu_Percp_2: int  2 7 7 5 5 7 2 4 6 6 ...
##  $ Pur_Proces_1: int  6 7 7 5 6 7 2 4 6 6 ...
##  $ Pur_Proces_2: int  4 6 7 4 7 7 6 4 6 6 ...
##  $ Residence   : int  2 2 1 2 1 2 2 1 2 1 ...
##  $ Pay_Meth    : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Insur_Type  : chr  "Collision" "Collision" "Collision" "Collision" ...
##  $ Gender      : chr  "Male" "Male" "Male" "Male" ...
##  $ Age         : int  18 18 19 19 19 19 19 21 21 21 ...
##  $ Education   : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ X           : logi  NA NA NA NA NA NA ...
Car2<-read.csv("Car_Survey_2a.csv")
str(Car2)
## 'data.frame':    1049 obs. of  9 variables:
##  $ Respondents: chr  "Res1" "Res2" "Res3" "Res4" ...
##  $ Region     : chr  "European" "European" "European" "European" ...
##  $ Model      : chr  "Ford Expedition" "Ford Expedition" "Ford Expedition" "Ford Expedition" ...
##  $ MPG        : int  15 15 15 15 15 15 15 15 15 15 ...
##  $ Cyl        : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ acc1       : num  5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 ...
##  $ C_cost.    : num  16 16 16 16 16 16 16 16 16 16 ...
##  $ H_Cost     : num  14 14 14 14 14 14 14 14 14 14 ...
##  $ Post.Satis : int  4 3 5 5 5 3 3 6 3 5 ...

Rename Unique ID in Car2 to match Car1

names(Car2)[1]<-c("Resp")
head(Car2, n = 1)
##   Resp   Region           Model MPG Cyl acc1 C_cost. H_Cost Post.Satis
## 1 Res1 European Ford Expedition  15   8  5.5      16     14          4

Merge Car1 and Car2 into one data set

Car_Total <- merge(Car1, Car2, by = "Resp")

Finding numeric columns to remove NA values

numeric_cols <- sapply(Car_Total, is.numeric)

Replacing NA Values with calulated means for all numeric columns

Car_Total[, numeric_cols] <- lapply(Car_Total[, numeric_cols], function(x) {
  mean_val <- mean (x, na.rm = TRUE)
  x[is.na(x)] <- mean_val
  return(x)
})

Verify Replacement

summary(Car_Total)
##      Resp               Att_1           Att_2           Enj_1      
##  Length:1049        Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  Class :character   1st Qu.:4.000   1st Qu.:4.000   1st Qu.:5.000  
##  Mode  :character   Median :5.000   Median :6.000   Median :6.000  
##                     Mean   :4.882   Mean   :5.287   Mean   :5.378  
##                     3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:7.000  
##                     Max.   :7.000   Max.   :7.000   Max.   :7.000  
##      Enj_2         Perform_1       Perform_2       Perform_3    
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:4.000   1st Qu.:4.000   1st Qu.:3.000  
##  Median :5.000   Median :5.000   Median :5.000   Median :5.000  
##  Mean   :4.575   Mean   :4.947   Mean   :4.831   Mean   :4.217  
##  3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:6.000  
##  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :7.000  
##      WOM_1           WOM_2        Futu_Pur_1      Futu_Pur_2     Valu_Percp_1  
##  Min.   :1.000   Min.   :1.00   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:4.000   1st Qu.:4.00   1st Qu.:5.000   1st Qu.:5.000   1st Qu.:5.000  
##  Median :6.000   Median :6.00   Median :6.000   Median :6.000   Median :6.000  
##  Mean   :5.286   Mean   :5.35   Mean   :5.321   Mean   :5.371   Mean   :5.411  
##  3rd Qu.:7.000   3rd Qu.:6.00   3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:6.000  
##  Max.   :7.000   Max.   :7.00   Max.   :9.000   Max.   :7.000   Max.   :7.000  
##   Valu_Percp_2    Pur_Proces_1    Pur_Proces_2     Residence    
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:4.000   1st Qu.:5.000   1st Qu.:4.000   1st Qu.:1.000  
##  Median :5.000   Median :6.000   Median :5.000   Median :1.000  
##  Mean   :5.114   Mean   :5.256   Mean   :4.923   Mean   :1.474  
##  3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:2.000  
##  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :5.000  
##     Pay_Meth      Insur_Type           Gender               Age       
##  Min.   :1.000   Length:1049        Length:1049        Min.   :18.00  
##  1st Qu.:1.000   Class :character   Class :character   1st Qu.:23.00  
##  Median :2.000   Mode  :character   Mode  :character   Median :34.00  
##  Mean   :2.153                                         Mean   :35.22  
##  3rd Qu.:3.000                                         3rd Qu.:48.00  
##  Max.   :3.000                                         Max.   :60.00  
##    Education        X              Region             Model          
##  Min.   :1.000   Mode:logical   Length:1049        Length:1049       
##  1st Qu.:2.000   NA's:1049      Class :character   Class :character  
##  Median :2.000                  Mode  :character   Mode  :character  
##  Mean   :1.989                                                       
##  3rd Qu.:2.000                                                       
##  Max.   :3.000                                                       
##       MPG             Cyl           acc1          C_cost.          H_Cost      
##  Min.   :14.00   Min.   :4.0   Min.   :3.600   Min.   : 7.00   Min.   : 6.000  
##  1st Qu.:17.00   1st Qu.:4.0   1st Qu.:5.100   1st Qu.:10.00   1st Qu.: 8.000  
##  Median :19.00   Median :6.0   Median :6.500   Median :12.00   Median :10.000  
##  Mean   :19.58   Mean   :5.8   Mean   :6.202   Mean   :11.35   Mean   : 9.634  
##  3rd Qu.:22.00   3rd Qu.:6.0   3rd Qu.:7.500   3rd Qu.:13.00   3rd Qu.:11.000  
##  Max.   :26.00   Max.   :8.0   Max.   :8.500   Max.   :16.00   Max.   :14.000  
##    Post.Satis  
##  Min.   :2.00  
##  1st Qu.:5.00  
##  Median :6.00  
##  Mean   :5.28  
##  3rd Qu.:6.00  
##  Max.   :7.00

Creating a chart to depict number of people based on car models

ggplot(Car_Total, aes(x = Model, fill = Model)) +
  geom_bar() +
  theme_bw() +
  labs(title = " Number of People Owning Different Car Models",
       x = "Car Model",
       y = "Number") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Separate Model of Car from Make of Car.

Car_Total[c('Make', 'Model_v1')] <- str_split_fixed(Car_Total$Model, " ", 2) #slips it by second character

Mutate Data set to indicate Parent Companies of Car Brands

Car_Total <- Car_Total %>% #call dataframe and create new column with new groupings
  mutate(Parent = case_when(Make == "Buick" ~ "General Motors",
                            Make == "Chevrolet" ~ "General Motors",
                            Make == "Chrysler" ~ "Chrysler",
                            Make == "Dodge" ~ "Chrysler",
                            Make == "Fiat" ~ "Chrysler",
                            Make == "Ford" ~ "Ford",
                            Make == "Honda" ~ "Honda",
                            Make == "Kia" ~ "Kia",
                            Make == "Lincoln" ~ "Ford",
                            Make == "Toyota" ~ "Toyota",
                            TRUE ~ "Check"))

Car_Total$Parent<-as.factor(Car_Total$Parent)

Graphing Post Purchase Satisfaction based on Parent Company

p <-ggplot(Car_Total, aes(x=Post.Satis, fill = Parent)) +
  theme_bw() +
  geom_bar(position = 'dodge2') +
       labs(y='Number of Cars',
            x = 'Post Satisfaction',
            title = "Post Purchase Satisfaction by Make")
p + annotate("text", x=3, y=70, label = "1 = Dissatified, 7 = Very Satified")

Make a new column for Future Purchase Intentions

Car_Total$Futmean = (Car_Total$Futu_Pur_1 + Car_Total$Futu_Pur_2) / 2
View(Car_Total[c("Futu_Pur_1", "Futu_Pur_2", "Futmean")])

Filter Based on Toyota, Ford, Honda and Chrysler

Car_Total_Multi <- Car_Total %>%
  filter(Parent %in% c("Ford", "Toyota", "Honda", "Chrysler"))

Visualize Data Based on Parent Company

f <-ggplot(Car_Total_Multi, aes(x = factor(round(Futmean)), fill = Parent)) +
  geom_bar(position = 'dodge') +
  labs(y = 'Owners Rating',
       x = "Future Purchase Intention",
       title = "Number of People that will/likely Purchase the same Car") +
  theme_bw()
f + annotate("text", x=3, y=80, label = "1 = Strongly Disagree, 7 = Strongly Agree")

### Value Perception Contingency Table

Car_Total$Valu_mean <- (Car_Total$Valu_Percp_1 + Car_Total$Valu_Percp_2) / 2
brand_region_table <- aggregate(Valu_mean~Parent + Region, Car_Total, mean)
print(brand_region_table)
##            Parent         Region Valu_mean
## 1        Chrysler       American  5.143597
## 2            Ford       American  5.061728
## 3  General Motors       American  5.333333
## 4           Honda       American  5.026316
## 5             Kia       American  5.666667
## 6          Toyota       American  5.119353
## 7        Chrysler          Asian  4.791667
## 8            Ford          Asian  5.062500
## 9           Honda          Asian  5.422414
## 10         Toyota          Asian  5.127451
## 11       Chrysler       European  5.340909
## 12           Ford       European  5.637931
## 13 General Motors       European  4.854839
## 14          Honda       European  5.413793
## 15            Kia       European  5.421053
## 16         Toyota       European  5.281250
## 17       Chrysler Middle Eastern  5.489691
## 18           Ford Middle Eastern  5.485981
## 19 General Motors Middle Eastern  5.120000
## 20          Honda Middle Eastern  5.455882
## 21         Toyota Middle Eastern  5.750000

Filtering to only show top four companies

Multiple_Valu_mean <- brand_region_table %>%
  filter(Parent == "Honda" | Parent == "Toyota" | Parent == "Ford" | Parent == "Chrysler")
head(Multiple_Valu_mean, n=10)
##      Parent   Region Valu_mean
## 1  Chrysler American  5.143597
## 2      Ford American  5.061728
## 3     Honda American  5.026316
## 4    Toyota American  5.119353
## 5  Chrysler    Asian  4.791667
## 6      Ford    Asian  5.062500
## 7     Honda    Asian  5.422414
## 8    Toyota    Asian  5.127451
## 9  Chrysler European  5.340909
## 10     Ford European  5.637931
table(Multiple_Valu_mean$Parent)
## 
##       Chrysler           Ford General Motors          Honda            Kia 
##              4              4              0              4              0 
##         Toyota 
##              4

Displaying Data Visualization

ggplot(Multiple_Valu_mean, aes(x=Region, y=Valu_mean, group=Parent)) +
  geom_line(aes(color=Parent)) +
  geom_point(aes(color=Parent)) +
  scale_y_continuous(limits = c(4.5,6))

labs(y="Value Mean", 
     title = "Value Mean for Toyota, Honda, Ford and Chrysler by Region")
## $y
## [1] "Value Mean"
## 
## $title
## [1] "Value Mean for Toyota, Honda, Ford and Chrysler by Region"
## 
## attr(,"class")
## [1] "labels"

Create three categorical values for age

Car_Total$AgeGrp<-cut(Car_Total$Age,
                      breaks = c(0, 30, 50, Inf),
                      labels = c("Young adults", "Adults", "Mature Adults"),
                      right = FALSE)
names(Car_Total)
##  [1] "Resp"         "Att_1"        "Att_2"        "Enj_1"        "Enj_2"       
##  [6] "Perform_1"    "Perform_2"    "Perform_3"    "WOM_1"        "WOM_2"       
## [11] "Futu_Pur_1"   "Futu_Pur_2"   "Valu_Percp_1" "Valu_Percp_2" "Pur_Proces_1"
## [16] "Pur_Proces_2" "Residence"    "Pay_Meth"     "Insur_Type"   "Gender"      
## [21] "Age"          "Education"    "X"            "Region"       "Model"       
## [26] "MPG"          "Cyl"          "acc1"         "C_cost."      "H_Cost"      
## [31] "Post.Satis"   "Make"         "Model_v1"     "Parent"       "Futmean"     
## [36] "Valu_mean"    "AgeGrp"
head(Car_Total$AgeGrp)
## [1] Young adults Young adults Adults       Young adults Young adults
## [6] Young adults
## Levels: Young adults Adults Mature Adults

Visualization of age groups and car make purchases

ggplot(Car_Total, aes(x=Parent, fill = AgeGrp)) +
  theme_bw() +
  geom_bar(position = 'dodge') +
  labs(y="Number of People",
       x="Age",
       title = "Age Groups Purchasing Make")