Setup working directory
getwd()
## [1] "C:/Users/LMNig/OneDrive/Documents/School"
setwd("C:/Users/LMNig/OneDrive/Documents/School")
Survey Overview
# Load data file using import dataset
## Import car_survey_1 using "Import Dataset" function
## Import car_survey_2 using "Import Dataset" function
# Load data file using read.csv
## Using read.csv
# Read excel.csv file (save excel file as CSV UTF-8
Car1<-read.csv("C:/Users/LMNig/OneDrive/Documents/School/Car_Survey_1a.csv")
## Display the structure of Car1 (data frame)
str(Car1)
## 'data.frame': 1049 obs. of 22 variables:
## $ Resp : chr "Res1" "Res2" "Res3" "Res4" ...
## $ Att_1 : int 6 7 7 4 6 6 1 6 3 6 ...
## $ Att_2 : int 6 5 7 1 6 6 1 5 2 6 ...
## $ Enj_1 : int 6 5 7 1 6 6 1 5 3 4 ...
## $ Enj_2 : int 6 2 5 1 5 5 1 3 2 4 ...
## $ Perform_1 : int 5 2 5 1 5 5 2 5 2 4 ...
## $ Perform_2 : int 6 6 5 1 2 5 2 5 3 4 ...
## $ Perform_3 : int 3 7 3 1 1 7 2 2 1 1 ...
## $ WOM_1 : int 3 5 6 7 7 5 2 4 6 5 ...
## $ WOM_2 : int 3 5 6 7 7 5 3 6 6 6 ...
## $ Futu_Pur_1 : int 3 6 7 3 7 7 5 4 7 6 ...
## $ Futu_Pur_2 : int 3 6 7 3 6 7 2 4 7 6 ...
## $ Valu_Percp_1: int 5 6 5 6 6 7 2 4 6 6 ...
## $ Valu_Percp_2: int 2 7 7 5 5 7 2 4 6 6 ...
## $ Pur_Proces_1: int 6 7 7 5 6 7 2 4 6 6 ...
## $ Pur_Proces_2: int 4 6 7 4 7 7 6 4 6 6 ...
## $ Residence : int 2 2 1 2 1 2 2 1 2 1 ...
## $ Pay_Meth : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Insur_Type : chr "Collision" "Collision" "Collision" "Collision" ...
## $ Gender : chr "Male" "Male" "Male" "Male" ...
## $ Age : int 18 18 19 19 19 19 19 21 21 21 ...
## $ Education : int 2 2 2 2 2 2 2 2 2 2 ...
## Display the first few rows of Car1(data frame)
head(Car1, n = 5)
## Resp Att_1 Att_2 Enj_1 Enj_2 Perform_1 Perform_2 Perform_3 WOM_1 WOM_2
## 1 Res1 6 6 6 6 5 6 3 3 3
## 2 Res2 7 5 5 2 2 6 7 5 5
## 3 Res3 7 7 7 5 5 5 3 6 6
## 4 Res4 4 1 1 1 1 1 1 7 7
## 5 Res5 6 6 6 5 5 2 1 7 7
## Futu_Pur_1 Futu_Pur_2 Valu_Percp_1 Valu_Percp_2 Pur_Proces_1 Pur_Proces_2
## 1 3 3 5 2 6 4
## 2 6 6 6 7 7 6
## 3 7 7 5 7 7 7
## 4 3 3 6 5 5 4
## 5 7 6 6 5 6 7
## Residence Pay_Meth Insur_Type Gender Age Education
## 1 2 2 Collision Male 18 2
## 2 2 2 Collision Male 18 2
## 3 1 2 Collision Male 19 2
## 4 2 2 Collision Male 19 2
## 5 1 2 Collision Female 19 2
# Read excel.csv file (save excel file as CSV UTF-8
Car2<-read.csv("C:/Users/LMNig/OneDrive/Documents/School/Car_Survey_2b.csv")
## Display the structure of Car1 (data frame)
str(Car2)
## 'data.frame': 1049 obs. of 9 variables:
## $ Respondents: chr "Res1" "Res2" "Res3" "Res4" ...
## $ Region : chr "European" "European" "European" "European" ...
## $ Model : chr "Ford Expedition" "Ford Expedition" "Ford Expedition" "Ford Expedition" ...
## $ MPG : int 15 15 15 15 15 15 15 15 15 15 ...
## $ Cyl : int 8 8 8 8 8 8 8 8 8 8 ...
## $ acc1 : num 5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 ...
## $ C_cost. : num 16 16 16 16 16 16 16 16 16 16 ...
## $ H_Cost : num 14 14 14 14 14 14 14 14 14 14 ...
## $ Post.Satis : int 4 3 5 5 5 3 3 6 3 5 ...
## Display the first few rows of Car1 (data frame)
head(Car2,n=5)
## Respondents Region Model MPG Cyl acc1 C_cost. H_Cost Post.Satis
## 1 Res1 European Ford Expedition 15 8 5.5 16 14 4
## 2 Res2 European Ford Expedition 15 8 5.5 16 14 3
## 3 Res3 European Ford Expedition 15 8 5.5 16 14 5
## 4 Res4 European Ford Expedition 15 8 5.5 16 14 5
## 5 Res5 European Ford Expedition 15 8 5.5 16 14 5
Merging Data
# Create a Master Dataset (combined car_survey1a and car_survey_2b)
# Rename unique ID in Car2 to match with Car1
## Renaming the first column in Car2 as "Resp"
## (to match with the first column name in Car1)
names(Car2)[1]<-c("Resp")
head(Car2, n=1)
## Resp Region Model MPG Cyl acc1 C_cost. H_Cost Post.Satis
## 1 Res1 European Ford Expedition 15 8 5.5 16 14 4
# Merge Car_Survey_1a and Car_Survey_2b into one dataset
Car_Total<-merge(Car1,Car2, by="Resp")
str(Car_Total)
## 'data.frame': 1049 obs. of 30 variables:
## $ Resp : chr "Res1" "Res10" "Res100" "Res1000" ...
## $ Att_1 : int 6 6 6 6 6 3 2 7 2 6 ...
## $ Att_2 : int 6 6 7 6 6 1 2 7 1 6 ...
## $ Enj_1 : int 6 4 7 7 7 4 1 7 2 6 ...
## $ Enj_2 : int 6 4 3 6 6 3 2 6 1 5 ...
## $ Perform_1 : int 5 4 5 6 6 5 2 5 2 5 ...
## $ Perform_2 : int 6 4 6 6 6 6 2 6 2 5 ...
## $ Perform_3 : int 3 1 6 6 6 6 1 5 2 5 ...
## $ WOM_1 : int 3 5 3 6 4 2 6 6 7 3 ...
## $ WOM_2 : int 3 6 5 6 4 6 7 6 7 3 ...
## $ Futu_Pur_1 : int 3 6 6 6 4 6 6 6 7 6 ...
## $ Futu_Pur_2 : int 3 6 6 6 6 6 5 7 7 6 ...
## $ Valu_Percp_1: int 5 6 7 4 5 5 4 6 4 5 ...
## $ Valu_Percp_2: int 2 6 6 6 6 4 4 5 6 6 ...
## $ Pur_Proces_1: int 6 6 5 6 6 5 4 5 6 6 ...
## $ Pur_Proces_2: int 4 6 5 3 7 5 5 5 7 5 ...
## $ Residence : int 2 1 2 2 1 1 1 2 1 2 ...
## $ Pay_Meth : int 2 2 1 3 3 3 3 3 3 3 ...
## $ Insur_Type : chr "Collision" "Collision" "Collision" "Liability" ...
## $ Gender : chr "Male" "Male" "Female" "Female" ...
## $ Age : int 18 21 32 24 24 25 26 26 27 27 ...
## $ Education : int 2 2 1 2 2 2 2 2 2 2 ...
## $ Region : chr "European" "European" "American" "Asian" ...
## $ Model : chr "Ford Expedition" "Ford Expedition" "Toyota Rav4" "Toyota Corolla" ...
## $ MPG : int 15 15 24 26 26 26 26 26 26 26 ...
## $ Cyl : int 8 8 4 4 4 4 4 4 4 4 ...
## $ acc1 : num 5.5 5.5 8.2 8 8 8 8 8 8 8 ...
## $ C_cost. : num 16 16 10 7 7 7 7 7 7 7 ...
## $ H_Cost : num 14 14 8 6 6 6 6 6 6 6 ...
## $ Post.Satis : int 4 5 4 6 5 6 5 6 7 6 ...
head(Car_Total)
## Resp Att_1 Att_2 Enj_1 Enj_2 Perform_1 Perform_2 Perform_3 WOM_1 WOM_2
## 1 Res1 6 6 6 6 5 6 3 3 3
## 2 Res10 6 6 4 4 4 4 1 5 6
## 3 Res100 6 7 7 3 5 6 6 3 5
## 4 Res1000 6 6 7 6 6 6 6 6 6
## 5 Res1001 6 6 7 6 6 6 6 4 4
## 6 Res1002 3 1 4 3 5 6 6 2 6
## Futu_Pur_1 Futu_Pur_2 Valu_Percp_1 Valu_Percp_2 Pur_Proces_1 Pur_Proces_2
## 1 3 3 5 2 6 4
## 2 6 6 6 6 6 6
## 3 6 6 7 6 5 5
## 4 6 6 4 6 6 3
## 5 4 6 5 6 6 7
## 6 6 6 5 4 5 5
## Residence Pay_Meth Insur_Type Gender Age Education Region Model
## 1 2 2 Collision Male 18 2 European Ford Expedition
## 2 1 2 Collision Male 21 2 European Ford Expedition
## 3 2 1 Collision Female 32 1 American Toyota Rav4
## 4 2 3 Liability Female 24 2 Asian Toyota Corolla
## 5 1 3 Liability Female 24 2 Asian Toyota Corolla
## 6 1 3 Liability Female 25 2 Asian Toyota Corolla
## MPG Cyl acc1 C_cost. H_Cost Post.Satis
## 1 15 8 5.5 16 14 4
## 2 15 8 5.5 16 14 5
## 3 24 4 8.2 10 8 4
## 4 26 4 8.0 7 6 6
## 5 26 4 8.0 7 6 5
## 6 26 4 8.0 7 6 6
# Save the merged data (Car_Total) to a file
## save as CSV
write.csv(Car_Total, "Car_Total", row.names=FALSE)
View(Car_Total)
# Using readxl Package
## Load the package
library(readxl)
# Replace "file_path.xlsx" with the path to your file
Car_data_1 <-read_excel("C:/Users/LMNig/OneDrive/Documents/School/Copy of Car_Survey_1.xlsx")
Car_data_2 <-read_excel ("C:/Users/LMNig/OneDrive/Documents/School/Copy of Car_Survey_2.xlsx")
#Summary of Key Analysis Variables
# Create a summary of the selected variables
summary(Car_Total[c("Att_1", "Att_2")])
## Att_1 Att_2
## Min. :1.000 Min. :1.000
## 1st Qu.:4.000 1st Qu.:4.000
## Median :6.000 Median :6.000
## Mean :4.882 Mean :5.287
## 3rd Qu.:6.000 3rd Qu.:6.000
## Max. :7.000 Max. :7.000
## NA's :4
# Create a summary of the selected variables
summary(Car_Total[c("Valu_Percp_1", "Valu_Percp_2")])
## Valu_Percp_1 Valu_Percp_2
## Min. :1.000 Min. :1.000
## 1st Qu.:5.000 1st Qu.:4.000
## Median :6.000 Median :5.000
## Mean :5.411 Mean :5.114
## 3rd Qu.:6.000 3rd Qu.:6.000
## Max. :7.000 Max. :7.000
## NA's :4 NA's :1
# Create a summary of the selected variables
summary(Car_Total[c("WOM_1", "WOM_2")])
## WOM_1 WOM_2
## Min. :1.000 Min. :1.00
## 1st Qu.:4.000 1st Qu.:4.00
## Median :6.000 Median :6.00
## Mean :5.286 Mean :5.35
## 3rd Qu.:7.000 3rd Qu.:6.00
## Max. :7.000 Max. :7.00
## NA's :1 NA's :3
# Create a summary of the selected variables
summary(Car_Total[c("Futu_Pur_1", "Futu_Pur_2")])
## Futu_Pur_1 Futu_Pur_2
## Min. :1.000 Min. :1.000
## 1st Qu.:4.000 1st Qu.:5.000
## Median :6.000 Median :6.000
## Mean :5.321 Mean :5.371
## 3rd Qu.:6.000 3rd Qu.:6.000
## Max. :9.000 Max. :7.000
## NA's :5 NA's :2
# Create a summary of the selected variables
summary(Car_Total[c("Post.Satis")])
## Post.Satis
## Min. :2.00
## 1st Qu.:5.00
## Median :6.00
## Mean :5.28
## 3rd Qu.:6.00
## Max. :7.00
Replacing Na Values
# Count NA values before replacement
na_counts_before <- colSums(is.na(Car_Total))
cat("NA counts before replacement:\n")
## NA counts before replacement:
print(na_counts_before)
## Resp Att_1 Att_2 Enj_1 Enj_2 Perform_1
## 0 4 0 4 4 2
## Perform_2 Perform_3 WOM_1 WOM_2 Futu_Pur_1 Futu_Pur_2
## 4 1 1 3 5 2
## Valu_Percp_1 Valu_Percp_2 Pur_Proces_1 Pur_Proces_2 Residence Pay_Meth
## 4 1 3 4 5 0
## Insur_Type Gender Age Education Region Model
## 0 0 0 0 0 0
## MPG Cyl acc1 C_cost. H_Cost Post.Satis
## 0 0 0 0 0 0
# Identifying numeric columns
numeric_cols <- sapply(Car_Total, is.numeric)
# Replacing NA values with the calculated means for all numeric columns
Car_Total[, numeric_cols] <- lapply(Car_Total[, numeric_cols], function(x) {
mean_val <- mean(x, na.rm = TRUE)
x[is.na(x)] <- mean_val
return(x)
})
#Check for any remaining NA's
any(is.na(Car_Total)) #False=No remaining Na values
## [1] FALSE
Manipulating Data
Creation of “Parent” grouping for each car make and model.
#Group by Car Make
library(stringr) #import library
#Seperate model col into two, delimit using space
Car_Total [c('Make', 'Model_v1')] <- str_split_fixed(Car_Total$Model, " ", 2)
#See the two new columns ("Make", and "Make_v1" in Car_Total data file)
View(Car_Total)
#Group by Parent Company
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Car_Total <- Car_Total %>% #Call dataframe and create new column with new groupings
mutate(Parent = case_when(Make == "Buick" ~ "General Motors",
Make == "Chevrolet" ~ "General Motors",
Make == "Chrysler" ~ "Chrysler",
Make == "Dodge" ~ "Chrysler",
Make == "Fiat" ~ "Chrysler",
Make == "Ford" ~ "Ford",
Make == "Honda" ~ "Honda",
Make == "Kia" ~ "Kia",
Make == "Lincoln" ~ "Ford",
Make == "Toyota" ~ "Toyota",
TRUE ~ "Check"))
#Check if the grouping is correct
count(Car_Total, Car_Total$Make, Car_Total$Parent, name = "Freq")
## Car_Total$Make Car_Total$Parent Freq
## 1 Buick General Motors 31
## 2 Chevrolet General Motors 64
## 3 Chrysler Chrysler 169
## 4 Dodge Chrysler 41
## 5 Fiat Chrysler 18
## 6 Ford Ford 202
## 7 Honda Honda 159
## 8 Kia Kia 34
## 9 Lincoln Ford 39
## 10 Toyota Toyota 292
table(Car_Total$Make)
##
## Buick Chevrolet Chrysler Dodge Fiat Ford Honda Kia
## 31 64 169 41 18 202 159 34
## Lincoln Toyota
## 39 292
Age Categories
# Convert Age to numeric
Car_Total$Age <- as.numeric(as.character(Car_Total$Age))
# Create age categories... 1=(18-29yr), 2=(30-49yr), 3=(50yr+)
Car_Total$Age_Category <- cut(Car_Total$Age,
breaks = c(17, 29, 49, Inf),
labels = c("1", "2", "3"),
right = TRUE)
# Result
table(Car_Total$Age_Category)
##
## 1 2 3
## 443 375 231
Mutate
# Creation of new variables using the average of each
##Average of Att_1 and Att_2 to create new variable "Attitude"
Car_Total$Attitude <- rowMeans(Car_Total[, c("Att_1", "Att_2")], na.rm = TRUE)
# Result
summary(Car_Total$Attitude)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 4.000 5.000 5.085 6.000 7.000
##Average of Futu_Pur_1 and Futu_Pur_2 to create new variable "FPI"
Car_Total$FPI <- rowMeans(Car_Total[, c("Futu_Pur_1", "Futu_Pur_2")], na.rm = TRUE)
# Result
summary(Car_Total$FPI)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 4.500 6.000 5.346 6.000 7.000
##Average of Valu_Percp_1 and Valu_Percp_2 to create new variable "ValuePercp"
Car_Total$ValuePercp <- rowMeans(Car_Total[, c("Valu_Percp_1", "Valu_Percp_2")], na.rm = TRUE)
# Result
summary(Car_Total$ValuePercp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 4.500 5.500 5.263 6.000 7.000
##Average of WOM_1 and WOM_2 to create new variable "WOM"
Car_Total$WOM <- rowMeans(Car_Total[, c("WOM_1", "WOM_2")], na.rm = TRUE)
# Result
summary(Car_Total$WOM)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 4.500 5.500 5.318 6.500 7.000
# Calculate the average of Post.Satis to create new variable "satisfactionmean"
Satisfactionmean <- mean(Car_Total$Post.Satis, na.rm = TRUE)
# Display the result
print(Satisfactionmean)
## [1] 5.280267
Market Segmentation and Target Audience
# Group by Car Make
library(stringr)
# Seperate model col into two, delimit using space
Car_Total [c('Make', 'Model_v1')] <- str_split_fixed(Car_Total$Model, " ", 2)
#See the two new columns ("Make", and "Make_v1" in Car_Total data file)
View(Car_Total)
library(ggplot2)
library(dplyr)
# Age Distribution for Toyota Car Models
## Filter for specific Toyota models
toyota_data <- Car_Total %>%
filter(Model_v1 %in% c("Corolla", "Rav4", "Highlander"))
## Create a bar chart with Model on the x-axis and fill by Age_Category
ggplot(toyota_data, aes(x = Model_v1, fill = Age_Category)) +
geom_bar(position = "dodge") +
labs(title = "Age Distribution by Toyota Model",
x = "Toyota Model",
y = "Count")

# Region Distribution of Toyota car models
## Filter for specific Toyota models
toyota_data <- Car_Total %>%
filter(Model_v1 %in% c("Corolla", "Rav4", "Highlander"))
## Create a bar chart with Model on the x-axis and fill by Region
ggplot(toyota_data, aes(x = Model_v1, fill = Region)) +
geom_bar(position = "dodge") +
labs(title = "Model Region Distribution",
x = "Toyota Model",
y = "Count")

Key Information Need 3: What is the average post-purchase
satisfaction level among young adults regarding their current car
purchases, and how does it vary by Parent company and Region?
## Key Information Need 3: What is the average post-purchase satisfaction level among young adults regarding their current car purchases, and how does it vary by Parent company and Region?
# Calculate the average post-purchase satisfaction for young adults grouped by Parent company
# Calculate the average post-purchase satisfaction for young adults (Age_Category == "1") by Parent company and Region
average_post_satisfaction_young_adults <- Car_Total %>%
filter(Age_Category == "1") %>% # Filter for young adults
group_by(Parent, Region) %>% # Group by Parent company and Region
summarize(
Average_Post_Satisfaction = mean(Post.Satis, na.rm = TRUE))
## `summarise()` has grouped output by 'Parent'. You can override using the
## `.groups` argument.
# View the result
print(average_post_satisfaction_young_adults)
## # A tibble: 17 × 3
## # Groups: Parent [5]
## Parent Region Average_Post_Satisfaction
## <chr> <chr> <dbl>
## 1 Chrysler American 4.96
## 2 Chrysler Asian 4.6
## 3 Chrysler European 5.08
## 4 Chrysler Middle Eastern 5.20
## 5 Ford American 4.06
## 6 Ford European 4.13
## 7 Ford Middle Eastern 5.3
## 8 General Motors American 4.14
## 9 General Motors European 4.8
## 10 General Motors Middle Eastern 5.12
## 11 Honda American 5.04
## 12 Honda Asian 5.44
## 13 Honda European 5.68
## 14 Honda Middle Eastern 5.78
## 15 Toyota American 5.5
## 16 Toyota Asian 5.79
## 17 Toyota European 5.55
ggplot(average_post_satisfaction_young_adults, aes(x = Parent, y = Average_Post_Satisfaction, color = Region)) +
geom_point(size = 4) +
geom_line(aes(group = Region), size = 1) +
labs(title = "Average Post-Purchase Satisfaction Among Young Adults by Parent and Region",
x = "Parent Company",
y = "Average Post-Purchase Satisfaction")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
