getwd()
## [1] "C:/Users/LMNig/OneDrive/Documents/School"
setwd("C:/Users/LMNig/OneDrive/Documents/School")
options(repos = c(CRAN = "https://cloud.r-project.org"))
# Load data file using import dataset
## Import car_survey_1 using "Import Dataset" function
## Import car_survey_2 using "Import Dataset" function
# Load data file using read.csv
## Using read.csv
# Read the CSV file (the Excel workbook was first saved as "CSV UTF-8")
Car1 <- read.csv(file = "C:/Users/LMNig/OneDrive/Documents/School/Car_Survey_1a.csv")
# Show the column names, types, and leading values of the Car1 data frame
str(object = Car1)
## 'data.frame': 1049 obs. of 22 variables:
## $ Resp : chr "Res1" "Res2" "Res3" "Res4" ...
## $ Att_1 : int 6 7 7 4 6 6 1 6 3 6 ...
## $ Att_2 : int 6 5 7 1 6 6 1 5 2 6 ...
## $ Enj_1 : int 6 5 7 1 6 6 1 5 3 4 ...
## $ Enj_2 : int 6 2 5 1 5 5 1 3 2 4 ...
## $ Perform_1 : int 5 2 5 1 5 5 2 5 2 4 ...
## $ Perform_2 : int 6 6 5 1 2 5 2 5 3 4 ...
## $ Perform_3 : int 3 7 3 1 1 7 2 2 1 1 ...
## $ WOM_1 : int 3 5 6 7 7 5 2 4 6 5 ...
## $ WOM_2 : int 3 5 6 7 7 5 3 6 6 6 ...
## $ Futu_Pur_1 : int 3 6 7 3 7 7 5 4 7 6 ...
## $ Futu_Pur_2 : int 3 6 7 3 6 7 2 4 7 6 ...
## $ Valu_Percp_1: int 5 6 5 6 6 7 2 4 6 6 ...
## $ Valu_Percp_2: int 2 7 7 5 5 7 2 4 6 6 ...
## $ Pur_Proces_1: int 6 7 7 5 6 7 2 4 6 6 ...
## $ Pur_Proces_2: int 4 6 7 4 7 7 6 4 6 6 ...
## $ Residence : int 2 2 1 2 1 2 2 1 2 1 ...
## $ Pay_Meth : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Insur_Type : chr "Collision" "Collision" "Collision" "Collision" ...
## $ Gender : chr "Male" "Male" "Male" "Male" ...
## $ Age : int 18 18 19 19 19 19 19 21 21 21 ...
## $ Education : int 2 2 2 2 2 2 2 2 2 2 ...
# Preview the first five rows of the Car1 data frame
head(Car1, 5)
## Resp Att_1 Att_2 Enj_1 Enj_2 Perform_1 Perform_2 Perform_3 WOM_1 WOM_2
## 1 Res1 6 6 6 6 5 6 3 3 3
## 2 Res2 7 5 5 2 2 6 7 5 5
## 3 Res3 7 7 7 5 5 5 3 6 6
## 4 Res4 4 1 1 1 1 1 1 7 7
## 5 Res5 6 6 6 5 5 2 1 7 7
## Futu_Pur_1 Futu_Pur_2 Valu_Percp_1 Valu_Percp_2 Pur_Proces_1 Pur_Proces_2
## 1 3 3 5 2 6 4
## 2 6 6 6 7 7 6
## 3 7 7 5 7 7 7
## 4 3 3 6 5 5 4
## 5 7 6 6 5 6 7
## Residence Pay_Meth Insur_Type Gender Age Education
## 1 2 2 Collision Male 18 2
## 2 2 2 Collision Male 18 2
## 3 1 2 Collision Male 19 2
## 4 2 2 Collision Male 19 2
## 5 1 2 Collision Female 19 2
# Read the CSV file (the Excel workbook was first saved as "CSV UTF-8")
Car2 <- read.csv(file = "C:/Users/LMNig/OneDrive/Documents/School/Car_Survey_2b.csv")
# Show the column names, types, and leading values of the Car2 data frame
str(object = Car2)
## 'data.frame': 1049 obs. of 9 variables:
## $ Respondents: chr "Res1" "Res2" "Res3" "Res4" ...
## $ Region : chr "European" "European" "European" "European" ...
## $ Model : chr "Ford Expedition" "Ford Expedition" "Ford Expedition" "Ford Expedition" ...
## $ MPG : int 15 15 15 15 15 15 15 15 15 15 ...
## $ Cyl : int 8 8 8 8 8 8 8 8 8 8 ...
## $ acc1 : num 5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 ...
## $ C_cost. : num 16 16 16 16 16 16 16 16 16 16 ...
## $ H_Cost : num 14 14 14 14 14 14 14 14 14 14 ...
## $ Post.Satis : int 4 3 5 5 5 3 3 6 3 5 ...
# Preview the first five rows of the Car2 data frame
head(Car2, 5)
## Respondents Region Model MPG Cyl acc1 C_cost. H_Cost Post.Satis
## 1 Res1 European Ford Expedition 15 8 5.5 16 14 4
## 2 Res2 European Ford Expedition 15 8 5.5 16 14 3
## 3 Res3 European Ford Expedition 15 8 5.5 16 14 5
## 4 Res4 European Ford Expedition 15 8 5.5 16 14 5
## 5 Res5 European Ford Expedition 15 8 5.5 16 14 5
# Create a Master Dataset (combined car_survey1a and car_survey_2b)
# Align the respondent ID column of Car2 with Car1: the first column of Car2
# is renamed to "Resp", the name the ID column carries in Car1
colnames(Car2)[1] <- "Resp"
head(Car2, 1)
## Resp Region Model MPG Cyl acc1 C_cost. H_Cost Post.Satis
## 1 Res1 European Ford Expedition 15 8 5.5 16 14 4
# Join the two survey data frames on the shared respondent ID column "Resp".
# merge() sorts the result on the key, so rows come back in character order
# of Resp (Res1, Res10, Res100, ...).
Car_Total <- merge(x = Car1, y = Car2, by = "Resp")
str(object = Car_Total)
## 'data.frame': 1049 obs. of 30 variables:
## $ Resp : chr "Res1" "Res10" "Res100" "Res1000" ...
## $ Att_1 : int 6 6 6 6 6 3 2 7 2 6 ...
## $ Att_2 : int 6 6 7 6 6 1 2 7 1 6 ...
## $ Enj_1 : int 6 4 7 7 7 4 1 7 2 6 ...
## $ Enj_2 : int 6 4 3 6 6 3 2 6 1 5 ...
## $ Perform_1 : int 5 4 5 6 6 5 2 5 2 5 ...
## $ Perform_2 : int 6 4 6 6 6 6 2 6 2 5 ...
## $ Perform_3 : int 3 1 6 6 6 6 1 5 2 5 ...
## $ WOM_1 : int 3 5 3 6 4 2 6 6 7 3 ...
## $ WOM_2 : int 3 6 5 6 4 6 7 6 7 3 ...
## $ Futu_Pur_1 : int 3 6 6 6 4 6 6 6 7 6 ...
## $ Futu_Pur_2 : int 3 6 6 6 6 6 5 7 7 6 ...
## $ Valu_Percp_1: int 5 6 7 4 5 5 4 6 4 5 ...
## $ Valu_Percp_2: int 2 6 6 6 6 4 4 5 6 6 ...
## $ Pur_Proces_1: int 6 6 5 6 6 5 4 5 6 6 ...
## $ Pur_Proces_2: int 4 6 5 3 7 5 5 5 7 5 ...
## $ Residence : int 2 1 2 2 1 1 1 2 1 2 ...
## $ Pay_Meth : int 2 2 1 3 3 3 3 3 3 3 ...
## $ Insur_Type : chr "Collision" "Collision" "Collision" "Liability" ...
## $ Gender : chr "Male" "Male" "Female" "Female" ...
## $ Age : int 18 21 32 24 24 25 26 26 27 27 ...
## $ Education : int 2 2 1 2 2 2 2 2 2 2 ...
## $ Region : chr "European" "European" "American" "Asian" ...
## $ Model : chr "Ford Expedition" "Ford Expedition" "Toyota Rav4" "Toyota Corolla" ...
## $ MPG : int 15 15 24 26 26 26 26 26 26 26 ...
## $ Cyl : int 8 8 4 4 4 4 4 4 4 4 ...
## $ acc1 : num 5.5 5.5 8.2 8 8 8 8 8 8 8 ...
## $ C_cost. : num 16 16 10 7 7 7 7 7 7 7 ...
## $ H_Cost : num 14 14 8 6 6 6 6 6 6 6 ...
## $ Post.Satis : int 4 5 4 6 5 6 5 6 7 6 ...
head(Car_Total)
## Resp Att_1 Att_2 Enj_1 Enj_2 Perform_1 Perform_2 Perform_3 WOM_1 WOM_2
## 1 Res1 6 6 6 6 5 6 3 3 3
## 2 Res10 6 6 4 4 4 4 1 5 6
## 3 Res100 6 7 7 3 5 6 6 3 5
## 4 Res1000 6 6 7 6 6 6 6 6 6
## 5 Res1001 6 6 7 6 6 6 6 4 4
## 6 Res1002 3 1 4 3 5 6 6 2 6
## Futu_Pur_1 Futu_Pur_2 Valu_Percp_1 Valu_Percp_2 Pur_Proces_1 Pur_Proces_2
## 1 3 3 5 2 6 4
## 2 6 6 6 6 6 6
## 3 6 6 7 6 5 5
## 4 6 6 4 6 6 3
## 5 4 6 5 6 6 7
## 6 6 6 5 4 5 5
## Residence Pay_Meth Insur_Type Gender Age Education Region Model
## 1 2 2 Collision Male 18 2 European Ford Expedition
## 2 1 2 Collision Male 21 2 European Ford Expedition
## 3 2 1 Collision Female 32 1 American Toyota Rav4
## 4 2 3 Liability Female 24 2 Asian Toyota Corolla
## 5 1 3 Liability Female 24 2 Asian Toyota Corolla
## 6 1 3 Liability Female 25 2 Asian Toyota Corolla
## MPG Cyl acc1 C_cost. H_Cost Post.Satis
## 1 15 8 5.5 16 14 4
## 2 15 8 5.5 16 14 5
## 3 24 4 8.2 10 8 4
## 4 26 4 8.0 7 6 6
## 5 26 4 8.0 7 6 5
## 6 26 4 8.0 7 6 6
# Save the merged data (Car_Total) to a CSV file in the working directory.
# The file name carries the .csv extension so the export is recognisable as
# a CSV file; row names are left out of the file.
write.csv(Car_Total, "Car_Total.csv", row.names = FALSE)
# Open the merged data in the data viewer
View(Car_Total)
# Alternative import: read the Excel workbooks directly with readxl
library(readxl)
# Each call takes the full path to the workbook
Car_data_1 <- read_excel(path = "C:/Users/LMNig/OneDrive/Documents/School/Copy of Car_Survey_1.xlsx")
Car_data_2 <- read_excel(path = "C:/Users/LMNig/OneDrive/Documents/School/Copy of Car_Survey_2.xlsx")
#Summary of Key Analysis Variables
# Descriptive summary of the two future-purchase items
summary(Car_Total[, c("Futu_Pur_1", "Futu_Pur_2")])
## Futu_Pur_1 Futu_Pur_2
## Min. :1.000 Min. :1.000
## 1st Qu.:4.000 1st Qu.:5.000
## Median :6.000 Median :6.000
## Mean :5.321 Mean :5.371
## 3rd Qu.:6.000 3rd Qu.:6.000
## Max. :9.000 Max. :7.000
## NA's :5 NA's :2
# Descriptive summary of the two enjoyment items
summary(Car_Total[, c("Enj_1", "Enj_2")])
## Enj_1 Enj_2
## Min. :1.000 Min. :1.000
## 1st Qu.:4.000 1st Qu.:3.000
## Median :6.000 Median :5.000
## Mean :5.378 Mean :4.575
## 3rd Qu.:7.000 3rd Qu.:6.000
## Max. :7.000 Max. :7.000
## NA's :4 NA's :4
# Descriptive summary of the three performance items
summary(Car_Total[, c("Perform_1", "Perform_2", "Perform_3")])
## Perform_1 Perform_2 Perform_3
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:4.000 1st Qu.:4.000 1st Qu.:3.000
## Median :5.000 Median :5.000 Median :5.000
## Mean :4.947 Mean :4.831 Mean :4.217
## 3rd Qu.:6.000 3rd Qu.:6.000 3rd Qu.:6.000
## Max. :7.000 Max. :7.000 Max. :7.000
## NA's :2 NA's :4 NA's :1
# Descriptive summary of the two purchase-process items
summary(Car_Total[, c("Pur_Proces_1", "Pur_Proces_2")])
## Pur_Proces_1 Pur_Proces_2
## Min. :1.000 Min. :1.000
## 1st Qu.:5.000 1st Qu.:4.000
## Median :6.000 Median :5.000
## Mean :5.256 Mean :4.923
## 3rd Qu.:6.000 3rd Qu.:6.000
## Max. :7.000 Max. :7.000
## NA's :3 NA's :4
# Descriptive summary of the post-purchase satisfaction rating
# (single-bracket indexing keeps a one-column data frame)
summary(Car_Total["Post.Satis"])
## Post.Satis
## Min. :2.00
## 1st Qu.:5.00
## Median :6.00
## Mean :5.28
## 3rd Qu.:6.00
## Max. :7.00
# Tally the NA values in every column of Car_Total before any rows are removed
na_counts_before <- vapply(
  Car_Total,
  function(column) sum(is.na(column)),
  numeric(1)
)
writeLines("NA counts before replacement:")
## NA counts before replacement:
print(na_counts_before)
## Resp Att_1 Att_2 Enj_1 Enj_2 Perform_1
## 0 4 0 4 4 2
## Perform_2 Perform_3 WOM_1 WOM_2 Futu_Pur_1 Futu_Pur_2
## 4 1 1 3 5 2
## Valu_Percp_1 Valu_Percp_2 Pur_Proces_1 Pur_Proces_2 Residence Pay_Meth
## 4 1 3 4 5 0
## Insur_Type Gender Age Education Region Model
## 0 0 0 0 0 0
## MPG Cyl acc1 C_cost. H_Cost Post.Satis
## 0 0 0 0 0 0
unique(Car_Total$Gender)
## [1] "Male" "Female" ""
# Blank gender entries are recoded to NA ...
is.na(Car_Total$Gender) <- Car_Total$Gender == ""
# ... and the respondents without a gender are then dropped
Car_Total <- subset(Car_Total, !is.na(Gender))
#Group by Car Make
library(stringr) #import library
## Warning: package 'stringr' was built under R version 4.4.2
# Separate the Model column into two pieces at the first space:
# "Make" (the first word) and "Model_v1" (everything after it)
Car_Total[c("Make", "Model_v1")] <- str_split_fixed(
  string = Car_Total$Model,
  pattern = " ",
  n = 2
)
# Inspect the two new columns ("Make" and "Model_v1") in Car_Total
View(Car_Total)
#Group by Parent Company
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Add a Parent column that maps each Make to its parent company.
# Makes that are their own parent keep their name; any Make not listed
# falls through to "Check" so unmapped values are easy to spot.
Car_Total <- Car_Total %>%
  mutate(Parent = case_when(
    Make %in% c("Buick", "Chevrolet") ~ "General Motors",
    Make %in% c("Chrysler", "Dodge", "Fiat") ~ "Chrysler",
    Make %in% c("Ford", "Lincoln") ~ "Ford",
    Make %in% c("Honda", "Kia", "Toyota") ~ Make,
    TRUE ~ "Check"
  ))
# Check that every Make was assigned to the intended Parent.
# Bare column names are used (not Car_Total$...), so the result columns are
# named Make and Parent instead of `Car_Total$Make` and `Car_Total$Parent`.
count(Car_Total, Make, Parent, name = "Freq")
## Car_Total$Make Car_Total$Parent Freq
## 1 Buick General Motors 31
## 2 Chevrolet General Motors 64
## 3 Chrysler Chrysler 169
## 4 Dodge Chrysler 41
## 5 Fiat Chrysler 18
## 6 Ford Ford 202
## 7 Honda Honda 159
## 8 Kia Kia 34
## 9 Lincoln Ford 39
## 10 Toyota Toyota 289
table(Car_Total$Make)
##
## Buick Chevrolet Chrysler Dodge Fiat Ford Honda Kia
## 31 64 169 41 18 202 159 34
## Lincoln Toyota
## 39 289
# Make sure Age is stored as a numeric vector
Car_Total$Age <- as.numeric(as.character(Car_Total$Age))
# Bin Age into three right-closed intervals:
#   "1" = (17, 29]  i.e. 18-29 years
#   "2" = (29, 49]  i.e. 30-49 years
#   "3" = (49, Inf) i.e. 50 years and over
Car_Total$Age_Category <- cut(
  x = Car_Total$Age,
  breaks = c(17, 29, 49, Inf),
  labels = as.character(1:3)
)
# Number of respondents in each age category
table(Car_Total$Age_Category)
##
## 1 2 3
## 440 375 231
# Creation of new variables using the average of each
## FPI: row-wise mean of Futu_Pur_1 and Futu_Pur_2, skipping missing items
Car_Total$FPI <- rowMeans(Car_Total[c("Futu_Pur_1", "Futu_Pur_2")], na.rm = TRUE)
# Distribution of the new variable
summary(Car_Total$FPI)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 4.500 6.000 5.343 6.000 7.000 1
## Enjoyment: row-wise mean of Enj_1 and Enj_2, skipping missing items
Car_Total$Enjoyment <- rowMeans(Car_Total[c("Enj_1", "Enj_2")], na.rm = TRUE)
# Distribution of the new variable
summary(Car_Total$Enjoyment)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 4.000 5.500 4.976 6.000 7.000 1
## Performance: row-wise mean of Perform_1, Perform_2 and Perform_3,
## skipping missing items
Car_Total$Performance <- rowMeans(
  Car_Total[c("Perform_1", "Perform_2", "Perform_3")],
  na.rm = TRUE
)
# Distribution of the new variable
summary(Car_Total$Performance)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 4.000 4.667 4.661 5.667 7.000
## PurProcess: row-wise mean of Pur_Proces_1 and Pur_Proces_2,
## skipping missing items
Car_Total$PurProcess <- rowMeans(Car_Total[c("Pur_Proces_1", "Pur_Proces_2")], na.rm = TRUE)
# Distribution of the new variable
summary(Car_Total$PurProcess)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 4.000 5.500 5.092 6.000 7.000 2
# Overall mean of Post.Satis, stored in the scalar "Satisfactionmean"
Satisfactionmean <- mean(Car_Total[["Post.Satis"]], na.rm = TRUE)
# Show the value
print(Satisfactionmean)
## [1] 5.280115
library(ggplot2)
library(dplyr)
# Keep the General Motors rows whose Insur_Type is not NA
gm_data <- filter(Car_Total, Parent == "General Motors", !is.na(Insur_Type))
# Side-by-side bar chart: count of each insurance type within every region
ggplot(data = gm_data, mapping = aes(x = Region, fill = Insur_Type)) +
  geom_bar(position = "dodge") +
  scale_fill_discrete(name = "Insurance Type") +
  labs(
    title = "Distribution of Insurance Types Across Regions for General Motors",
    x = "Region",
    y = "Count of Insurance Types"
  ) +
  theme_minimal()
## Distribution of Payment Methods for General Motors Cars Across Regions
# Packages used for the data step and the chart
library(ggplot2)
library(dplyr)
# Keep the General Motors rows, then turn the payment-method codes into
# labelled factor levels (1 = Lease, 2 = Finance, 3 = Cash)
gm_cars <- filter(Car_Total, Parent == "General Motors")
gm_cars$Pay_Meth <- factor(
  gm_cars$Pay_Meth,
  levels = 1:3,
  labels = c("Lease", "Finance", "Cash")
)
# Side-by-side bars: number of cars per payment method within every region
ggplot(data = gm_cars, mapping = aes(x = Region, fill = Pay_Meth)) +
  geom_bar(position = "dodge") +
  labs(
    title = "Distribution of Payment Methods for General Motors Cars Across Regions",
    x = "Region",
    y = "Number of Cars",
    fill = "Payment Method"
  ) +
  theme_minimal() +
  # Tilt the region labels so they do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(ggplot2)
library(dplyr)
# Count General Motors respondents for every Region x Age_Category pair.
# `.groups = "drop"` returns an ungrouped result directly, so no trailing
# ungroup() is needed and summarise() no longer prints its
# "has grouped output by 'Region'" message.
# Note: this reuses the name gm_data, replacing the data frame built for the
# insurance-type chart earlier in the script.
gm_data <- Car_Total %>%
  filter(Parent == "General Motors") %>%
  group_by(Region, Age_Category) %>%
  summarise(Count = n(), .groups = "drop")
## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.
# Grouped bar chart of the pre-computed counts: one bar per age group
# within every region
ggplot(
  data = gm_data,
  mapping = aes(x = Region, y = Count, fill = as.factor(Age_Category))
) +
  geom_bar(position = "dodge", stat = "identity") +
  theme_minimal() +
  labs(
    fill = "Age Group",
    y = "Frequency",
    x = "Region",
    title = "Age Distribution Across Regions for General Motors"
  )
# Independent samples t-test of FPI by Gender: exploratory checks first
# Boxplot of FPI for each gender group; col = c(5, 7) sets the two box colours
boxplot(Car_Total$FPI ~ Car_Total$Gender, col=c(5,7))
# Shapiro-Wilk test on FPI over the whole sample (both genders together)
shapiro.test(Car_Total$FPI) # normality assumption on the dependent variable
##
## Shapiro-Wilk normality test
##
## data: Car_Total$FPI
## W = 0.89679, p-value < 2.2e-16
## The p-value (< 2.2e-16) is below 0.05, so the null hypothesis of normality is rejected: the test suggests that FPI is not normally distributed.
# Fit FPI on Gender so that the model residuals can be checked for normality
res_aov <- aov(formula = FPI ~ Gender, data = Car_Total)
print(res_aov)
## Call:
## aov(formula = FPI ~ Gender, data = Car_Total)
##
## Terms:
## Gender Residuals
## Sum of Squares 2.1622 1842.1000
## Deg. of Freedom 1 1043
##
## Residual standard error: 1.328968
## Estimated effects may be unbalanced
## 1 observation deleted due to missingness
# Histogram of the residuals from res_aov
hist(res_aov$residuals) # visually check normality of residuals
# Bartlett test of homogeneity of variances: FPI grouped by Gender
bartlett.test(Car_Total$FPI, Car_Total$Gender)
##
## Bartlett test of homogeneity of variances
##
## data: Car_Total$FPI and Car_Total$Gender
## Bartlett's K-squared = 0.81872, df = 1, p-value = 0.3656
## P-value of 0.3656 is greater than 0.05, meaning the variance between the 2 groups is not significantly different. Equal variance is assumed.
# Research Question:
# H0: There is no significant difference in Future Purchase Intention(FPI) between different gender.
#H1: There is a significant difference in Future Purchase Intention (FPI) between different genders.
# Two-sided test with equal variances assumed (pooled-variance t-test).
# The argument is written out in full as `var.equal`; the abbreviated
# `var.eq` only worked through partial argument matching.
t.test(FPI ~ Gender, data = Car_Total, var.equal = TRUE)
##
## Two Sample t-test
##
## data: FPI by Gender
## t = 1.1065, df = 1043, p-value = 0.2688
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -0.0705663 0.2530372
## sample estimates:
## mean in group Female mean in group Male
## 5.385231 5.293996
## p-value of 0.2688 >0.05, means we fail to reject the null hypothesis. Thus, there is no significant difference in Future Purchase Intention(FPI) between genders.
# Research Question:
# H0: There is no significant difference in Enjoyment between different genders for General Motors customers.
#H1: There is a significant difference in Enjoyment between different genders for General Motors customers.
#two-sided test
# Install dplyr and stringr only when they are missing. Calling
# install.packages() unconditionally re-downloads on every run and, when the
# package is already loaded, only produces an "is in use and will not be
# installed" warning.
for (pkg in c("dplyr", "stringr")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(dplyr)
library(stringr)
# Keep the rows of Car_Total whose Parent value contains "General Motors"
is_gm_row <- grepl("General Motors", Car_Total$Parent, fixed = TRUE)
subgeneralmotors <- Car_Total[is_gm_row, ]
# Show only the first 10 rows to keep the knitted output short
head(subgeneralmotors, n = 10)
## Resp Att_1 Att_2 Enj_1 Enj_2 Perform_1 Perform_2 Perform_3 WOM_1 WOM_2
## 674 Res660 7 6 7 7 6 2 5 2 6
## 675 Res661 7 7 7 6 6 6 5 6 7
## 676 Res662 4 4 3 4 4 2 2 3 3
## 677 Res663 5 4 3 3 5 5 3 5 3
## 678 Res664 6 6 5 6 6 5 5 6 6
## 679 Res665 5 5 5 5 5 5 5 6 6
## 680 Res666 6 5 5 4 5 6 5 6 6
## 681 Res667 6 6 7 6 6 5 5 4 5
## 682 Res668 6 6 7 3 5 6 5 4 4
## 683 Res669 6 7 7 6 7 7 7 7 7
## Futu_Pur_1 Futu_Pur_2 Valu_Percp_1 Valu_Percp_2 Pur_Proces_1 Pur_Proces_2
## 674 5 3 4 5 5 5
## 675 6 6 6 1 2 7
## 676 3 3 2 2 3 1
## 677 5 4 5 3 4 4
## 678 4 4 5 4 5 5
## 679 2 2 2 4 4 4
## 680 4 4 4 7 6 1
## 681 4 5 5 2 3 6
## 682 5 6 4 3 3 4
## 683 5 5 5 5 5 5
## Residence Pay_Meth Insur_Type Gender Age Education Region
## 674 1 2 Collision Male 32 2 American
## 675 1 3 Collision Male 32 2 American
## 676 1 3 Comprehensive Female 32 2 American
## 677 1 3 Comprehensive Female 34 2 American
## 678 1 3 Comprehensive Male 34 3 American
## 679 1 3 Comprehensive Male 34 2 American
## 680 1 1 Comprehensive Male 34 2 American
## 681 1 3 Comprehensive Female 35 2 American
## 682 1 3 Comprehensive Male 35 2 American
## 683 1 2 Comprehensive Male 36 1 European
## Model MPG Cyl acc1 C_cost. H_Cost Post.Satis Make Model_v1
## 674 Chevrolet Camaro 14 8 4.1 14 12 4 Chevrolet Camaro
## 675 Chevrolet Camaro 14 8 4.1 14 12 6 Chevrolet Camaro
## 676 Chevrolet Camaro 14 8 4.1 14 12 5 Chevrolet Camaro
## 677 Chevrolet Camaro 14 8 4.1 14 12 3 Chevrolet Camaro
## 678 Chevrolet Camaro 14 8 4.1 14 12 6 Chevrolet Camaro
## 679 Chevrolet Camaro 14 8 4.1 14 12 3 Chevrolet Camaro
## 680 Chevrolet Camaro 14 8 4.1 14 12 5 Chevrolet Camaro
## 681 Chevrolet Camaro 14 8 4.1 14 12 6 Chevrolet Camaro
## 682 Chevrolet Camaro 14 8 4.1 14 12 3 Chevrolet Camaro
## 683 Chevrolet Camaro 14 8 4.1 14 12 5 Chevrolet Camaro
## Parent Age_Category FPI Enjoyment Performance PurProcess
## 674 General Motors 2 4.0 7.0 4.333333 5.0
## 675 General Motors 2 6.0 6.5 5.666667 4.5
## 676 General Motors 2 3.0 3.5 2.666667 2.0
## 677 General Motors 2 4.5 3.0 4.333333 4.0
## 678 General Motors 2 4.0 5.5 5.333333 5.0
## 679 General Motors 2 2.0 5.0 5.000000 4.0
## 680 General Motors 2 4.0 4.5 5.333333 3.5
## 681 General Motors 2 4.5 6.5 5.333333 4.5
## 682 General Motors 2 5.5 5.0 5.333333 3.5
## 683 General Motors 2 5.0 6.5 7.000000 5.0
# Check the frequency of values in the Parent column within the subset
table(subgeneralmotors$Parent)
##
## General Motors
## 95
# Encode gender as a number: Female = 1, Male = 0
subgeneralmotors <- subgeneralmotors %>%
  mutate(Gender_numeric = case_when(
    Gender == "Female" ~ 1,
    Gender == "Male" ~ 0
  ))
# Pooled-variance t-test within the General Motors subset. `var.equal` is
# written out in full; the abbreviated `var.eq` relied on partial matching.
# NOTE(review): the hypotheses stated for this section are about Enjoyment,
# but the test below is run on FPI -- confirm which outcome is intended.
t.test(FPI ~ Gender_numeric, data = subgeneralmotors, var.equal = TRUE)
##
## Two Sample t-test
##
## data: FPI by Gender_numeric
## t = -1.2805, df = 93, p-value = 0.2036
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.9990851 0.2157518
## sample estimates:
## mean in group 0 mean in group 1
## 4.908333 5.300000
## Since the p-value (0.2036) is greater than the significance level (0.05), we do not have sufficient evidence to reject the null hypothesis. This result suggests there is no statistically significant difference in the means of FPI between the two gender groups in this sample.
# Compare FPI between the two parent companies Ford and General Motors:
# keep only their rows, then run a pooled-variance two-sample t-test
Car_Total_Subset <- filter(
  Car_Total,
  Parent == "General Motors" | Parent == "Ford"
)
t.test(formula = FPI ~ Parent, data = Car_Total_Subset, var.equal = TRUE)
##
## Two Sample t-test
##
## data: FPI by Parent
## t = 3.0124, df = 334, p-value = 0.002789
## alternative hypothesis: true difference in means between group Ford and group General Motors is not equal to 0
## 95 percent confidence interval:
## 0.1674813 0.7977949
## sample estimates:
## mean in group Ford mean in group General Motors
## 5.535270 5.052632
## The p-value is 0.002789, which is below the common significance level of 0.05. This indicates that the difference in FPI scores between Ford and General Motors is statistically significant.
## Ford’s higher mean FPI could indicate better performance or satisfaction relative to General Motors on this metric.
#Step 1: show count, mean, variance, standard deviation of Enjoyment
# for each age category. The grouping column is referenced by its bare name,
# so the result column is called Age_Category rather than
# `Car_Total$Age_Category`.
stat_table <- Car_Total %>%
  group_by(Age_Category) %>%
  summarise(
    count = n(),
    mean = mean(Enjoyment, na.rm = TRUE),
    var = var(Enjoyment, na.rm = TRUE),
    sd = sd(Enjoyment, na.rm = TRUE)
  )
print(stat_table)
## # A tibble: 3 × 5
## `Car_Total$Age_Category` count mean var sd
## <fct> <int> <dbl> <dbl> <dbl>
## 1 1 440 4.88 2.56 1.60
## 2 2 375 5.12 2.13 1.46
## 3 3 231 4.92 2.23 1.49
#Step 2: ANOVA Test
#Check Normality Assumption
# Runs shapiro.test on Enjoyment separately within each Age_Category level
tapply(Car_Total$Enjoyment, Car_Total$Age_Category,shapiro.test)
## $`1`
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.92628, p-value = 7.174e-14
##
##
## $`2`
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.92538, p-value = 1.007e-12
##
##
## $`3`
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.93475, p-value = 1.304e-08
#indicates data is not normally distributed because p-value < 0.05
#Check homogeneity of variances in the dependent variable across conditions
bartlett.test(Car_Total$Enjoyment, Car_Total$Age_Category)
##
## Bartlett test of homogeneity of variances
##
## data: Car_Total$Enjoyment and Car_Total$Age_Category
## Bartlett's K-squared = 3.649, df = 2, p-value = 0.1613
#the assumption of homogeneity of variances holds in this case
# One-way ANOVA of Enjoyment across the age categories
aov_enjoyment_equal <- aov(
  formula = Enjoyment ~ as.factor(Age_Category),
  data = Car_Total
)
summary(object = aov_enjoyment_equal)
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Age_Category) 2 13.1 6.568 2.818 0.0602 .
## Residuals 1042 2428.8 2.331
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1 observation deleted due to missingness
##The p-value (0.0602) is just above 0.05, so we fail to reject the null hypothesis. This suggests that there is no statistically significant difference in Enjoyment across Age_Category groups at the 5% significance level.
#Step 3: Perform Post-Hoc Test (Tukey HSD Test)
## pairwise comparisons between group means
TukeyHSD(aov_enjoyment_equal)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = Enjoyment ~ as.factor(Age_Category), data = Car_Total)
##
## $`as.factor(Age_Category)`
## diff lwr upr p adj
## 2-1 0.24700683 -0.004962447 0.49897611 0.0561157
## 3-1 0.04724926 -0.244011811 0.33851033 0.9232135
## 3-2 -0.19975758 -0.499465327 0.09995018 0.2616634
plot(TukeyHSD(aov_enjoyment_equal)) #Visualize the pairwise results
# None of the pairwise comparisons show statistically significant differences in Enjoyment between age categories at the 5% significance level. This suggests that age category may not have a substantial effect on Enjoyment in this dataset.
#One-Way ANOVA test (2) ## Independent Variable: Age ## Dependent Variable: Car Purchase Process (PurProcess)
#Step 1: show count, mean, variance, standard deviation of PurProcess
# for each age category. The grouping column is referenced by its bare name,
# so the result column is called Age_Category rather than
# `Car_Total$Age_Category`.
stat_table <- Car_Total %>%
  group_by(Age_Category) %>%
  summarise(
    count = n(),
    mean = mean(PurProcess, na.rm = TRUE),
    var = var(PurProcess, na.rm = TRUE),
    sd = sd(PurProcess, na.rm = TRUE)
  )
print(stat_table)
## # A tibble: 3 × 5
## `Car_Total$Age_Category` count mean var sd
## <fct> <int> <dbl> <dbl> <dbl>
## 1 1 440 5.19 1.28 1.13
## 2 2 375 4.93 1.63 1.28
## 3 3 231 5.16 1.34 1.16
#Step 2: ANOVA Test
#Check Normality Assumption
# Runs shapiro.test on PurProcess separately within each Age_Category level
tapply(Car_Total$PurProcess, Car_Total$Age_Category,shapiro.test)
## $`1`
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.94162, p-value = 3.922e-12
##
##
## $`2`
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.95523, p-value = 3.147e-09
##
##
## $`3`
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.93374, p-value = 1.063e-08
#indicates data is not normally distributed because p-value < 0.05
#Check homogeneity of variances in the dependent variable across conditions
bartlett.test(Car_Total$PurProcess, Car_Total$Age_Category)
##
## Bartlett test of homogeneity of variances
##
## data: Car_Total$PurProcess and Car_Total$Age_Category
## Bartlett's K-squared = 6.3108, df = 2, p-value = 0.04262
# The assumption of homogeneity of variances does not hold for PurProcess across age categories. p-value of 0.04262 < 0.05
#ANOVA test with no equal variance assumption
oneway.test(PurProcess ~ as.factor(Age_Category), data = Car_Total)
##
## One-way analysis of means (not assuming equal variances)
##
## data: PurProcess and as.factor(Age_Category)
## F = 4.8055, num df = 2.00, denom df = 590.47, p-value = 0.008508
##The p-value (0.008508) is less than the common significance level of 0.05, we reject the null hypothesis. The test indicates that Age_Category has a significant effect on PurProcess, and there are likely meaningful differences in mean PurProcess scores among the different age categories.
#Step 3: Perform pairwise t-tests without assuming equal variances
## Pairwise comparisons between group means.
## The adjustment argument is `p.adjust.method` (singular). The earlier
## spelling `p.adjust.methods` did not match it, so the request for "BH" was
## silently ignored and the default Holm adjustment was applied -- the
## recorded output reports "P value adjustment method: holm". With the name
## corrected the Benjamini-Hochberg adjustment is used; re-run to refresh the
## recorded p-values.
pairwise.t.test(
  Car_Total$PurProcess,
  Car_Total$Age_Category,
  p.adjust.method = "BH",
  pool.sd = FALSE
)
##
## Pairwise comparisons using t tests with non-pooled SD
##
## data: Car_Total$PurProcess and Car_Total$Age_Category
##
## 1 2
## 2 0.009 -
## 3 0.778 0.048
##
## P value adjustment method: holm
## There is a significant difference in PurProcess between Age_Category 1 and Age_Category 2 (p = 0.009). Also, There is a significant difference between Age_Category 2 and Age_Category 3 (p = 0.048).
#Pearson Correlation Analysis
# Correlation matrix for the four rating variables, computed on the rows
# that have no missing value in any of them
Car_Total_Sub <- select(Car_Total, Enjoyment, Performance, Post.Satis, PurProcess)
cor(x = Car_Total_Sub, use = "complete.obs")
## Enjoyment Performance Post.Satis PurProcess
## Enjoyment 1.000000000 0.66289655 0.11794940 0.007145688
## Performance 0.662896550 1.00000000 0.09348855 0.040020059
## Post.Satis 0.117949396 0.09348855 1.00000000 0.110896075
## PurProcess 0.007145688 0.04002006 0.11089607 1.000000000
# Store the correlation matrix, then print it rounded to two decimal places
cor_matrix <- cor(x = Car_Total_Sub, use = "complete.obs")
cor_matrix_rounded <- round(cor_matrix, digits = 2)
print(cor_matrix_rounded)
## Enjoyment Performance Post.Satis PurProcess
## Enjoyment 1.00 0.66 0.12 0.01
## Performance 0.66 1.00 0.09 0.04
## Post.Satis 0.12 0.09 1.00 0.11
## PurProcess 0.01 0.04 0.11 1.00
#Interpretation of Coefficient Correlation:
##Enjoyment and Performance: The correlation is 0.66, indicating a moderate positive correlation. This suggests that as Enjoyment increases, Performance tends to increase as well.
##Enjoyment and Post.Satis: The correlation is 0.12, a weak positive correlation, indicating a slight tendency for Post.Satis to increase as Enjoyment increases.
##Enjoyment and PurProcess: The correlation is 0.01, which is near zero, indicating no meaningful relationship between Enjoyment and PurProcess.
##Performance and Post.Satis: The correlation is 0.09, also a weak positive correlation, indicating minimal association.
##Performance and PurProcess: The correlation is 0.04, which is very weak, suggesting almost no linear relationship between Performance and PurProcess.
##Post.Satis and PurProcess: The correlation is 0.11, another weak positive correlation, showing minimal association.
#Multiple Linear Regression - Across all car brands and segments. ## Is there a linear relationship between customers Future Purchase Intention (FPI) together with ## Enjoyment, Performance, Post Satisfaction, and Car Purchases Process ratings.
# Working copy of the data for the regression analysis
data_lm <- Car_Total
library(ggplot2)
# Install visreg only when it is not already available, instead of
# re-downloading it on every run of the script
if (!requireNamespace("visreg", quietly = TRUE)) {
  install.packages("visreg")
}
## Installing package into 'C:/Users/LMNig/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'visreg' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\LMNig\AppData\Local\Temp\Rtmp23XEkD\downloaded_packages
library(visreg)
## Warning: package 'visreg' was built under R version 4.4.2
## H0: The coefficients of all predictor variables (Enjoyment, Performance, Post.Satis, and PurProcess) are equal to zero. This means that none of the predictors have a significant effect on FPI.
## H1: At least one of the predictor variables (Enjoyment, Performance, Post.Satis, or PurProcess) has a coefficient that is not equal to zero, meaning that at least one predictor has a significant effect on FPI.
# Multiple linear regression of FPI on the four rating variables
model2 <- lm(
  formula = FPI ~ Enjoyment + Performance + Post.Satis + PurProcess,
  data = data_lm
)
summary(object = model2)
##
## Call:
## lm(formula = FPI ~ Enjoyment + Performance + Post.Satis + PurProcess,
## data = data_lm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8432 -0.5381 0.1598 0.7369 2.7452
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.31034 0.24039 5.451 6.26e-08 ***
## Enjoyment -0.02915 0.03143 -0.927 0.35399
## Performance 0.11822 0.03818 3.097 0.00201 **
## Post.Satis 0.36072 0.02946 12.246 < 2e-16 ***
## PurProcess 0.33838 0.03021 11.201 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.159 on 1038 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.2433, Adjusted R-squared: 0.2404
## F-statistic: 83.43 on 4 and 1038 DF, p-value: < 2.2e-16
# Check for multicollinearity among independent variables.
# The car package is installed only when it is not already available,
# instead of being re-downloaded on every run of the script.
if (!requireNamespace("car", quietly = TRUE)) {
  install.packages("car")
}
## Installing package into 'C:/Users/LMNig/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'car' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\LMNig\AppData\Local\Temp\Rtmp23XEkD\downloaded_packages
library(car)
## Warning: package 'car' was built under R version 4.4.2
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.2
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
vif(model2)
## Enjoyment Performance Post.Satis PurProcess
## 1.796133 1.788303 1.026965 1.014554
#All VIF values are well below 5, meaning that multicollinearity is not a concern
#Check Linearity of residuals
plot(model2, which=1)
#Since the plot shows a random scatter around 0, the linearity assumption is likely met.
#Check normality of residuals
##Use a QQ plot
plot(model2, which=2)
# The points show approximate, but not perfect, normality, which may affect the analysis.
#Using Shapiro-Wilk Test
shapiro.test(residuals(model2))
##
## Shapiro-Wilk normality test
##
## data: residuals(model2)
## W = 0.96937, p-value = 5.017e-14
#The results indicate that the residuals of model2 do not follow a normal distribution. This deviation from normality may impact the validity of certain assumptions during the regression analysis.
#Check Homoscedasticity (Constant Variance)
plot(model2, which=3)
#May be additional cause for concern. Points are fairly equally spread around the horizontal line, but not perfect.