# Libraries used
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
For this report I was given two sets of data to analyze from the statcan website.
# Read the file
df <- read.csv("surveydata.csv", header = T)
The first set of data was taken from a group of people who were asked, since the beginning of the Covid-19 pandemic, which of the following steps did you take when shopping online?
The steps were as follows:
The second set of data was taken from a group of people who were asked, during the Covid-19 pandemic, did your household’s use of the following free online services increase decrease or remain about the same compared to before the pandemic?
The free online services were as follows:
The group of people who answered each of these questions regarding Covid-19 were also asked various questions regarding their demographic.
These questions are as follows:
With each of these questions answered their was finally enough data to start to preform data analysis.
Before I could start analyzing the data their was lots of data preprocessing to do. The first thing I had to do was remove all the data from the data set that was not useful to me. This means that I needed to keep only the columns of data that related to the two data sets I was assigned and the demographic data. I used the following code to keep only the 24 columns that I needed to do the report.
# Keep only the columns I need
df <-select(df,PCS_45A:CPD_05F,HHLDSIZC:PERS_WGT)
Next, within the data their were lots of inconsistencies and values I would not be able to use. Some of these inconsistencies were the fact that some yes or no answers were written as 0’s and 1’s while others were written as 1’s and 2’s. Some of the data that was unusable were written down as 9’s, 6’s or 4’s. All of which stood for NA because the person answering the question didn’t have an answer or it didn’t apply to them. In order to fix these problems I coded multiple functions that would turn all values that were 9, 6, or 4 into NA and change any values that were a 2 to a 0.
# Function to recode all 2,6,9 to NA
colRecode269 <- function(startCol, endCol) {
for (i in startCol:endCol) {
df[, i] <<- ifelse(df[, i] == 2, 0, df[, i])
df[, i] <<- ifelse(df[, i] == 6, NA, df[, i])
df[, i] <<- ifelse(df[, i] == 9, NA, df[, i])
}
}
# Function to recode all 9,4 to NA
colRecode94 <- function(startCol, endCol) {
for (i in startCol:endCol) {
df[, i] <<- ifelse(df[, i] == 9, NA, df[, i])
df[, i] <<- ifelse(df[, i] == 4, NA, df[, i])
}
}
# Function to recode all 9 to NA
colRecode9 <- function(startCol, endCol) {
for (i in startCol:endCol) {
df[, i] <<- ifelse(df[, i] == 9, NA, df[, i])
}
}
# Function to recode all 2,6,9 to NA
recode269Tab <- function(stColName, endColName) {
x <- which(colnames(df) == stColName)
y <- which(colnames(df) == endColName)
colRecode269(x, y)
df <<- df
}
# Function to recode all 9,4 to NA
recode94Tab <- function(stColName, endColName) {
x <- which(colnames(df) == stColName)
y <- which(colnames(df) == endColName)
colRecode94(x, y)
df <<- df
}
# Function to recode all 9 to NA
recode9Tab <- function(stColName, endColName) {
x <- which(colnames(df) == stColName)
y <- which(colnames(df) == endColName)
colRecode9(x, y)
df <<- df
}
Next, the titles on each of the columns that represented each of the questions being asked was very hard to determine and confusing to read so I decided to rename each of the columns. After renaming each set of columns I also called on the functions I previously made and recoded all 2’s, 4’s, and 6’s to NA and all 2’s to 0.
# Behavior during shopping online (PCS_45)
df <- rename(df, noPersInfo = PCS_45A )
df <- rename(df, noCdtCard = PCS_45B )
df <- rename(df, onlyRptdSite = PCS_45C)
df <- rename(df, lowCrdtAmt = PCS_45D )
df <- rename(df, thirdPartyPmt = PCS_45E )
df <- rename(df, lookedHTTPS = PCS_45F )
df <- rename(df, usedStrongPswd = PCS_45G )
df <- rename(df, noShopOnline = PCS_45H )
recode269Tab('noPersInfo', 'noShopOnline')
# Household use of the following - How did it change (CPD_05)
df <- rename(df, useSocMedia = CPD_05A )
df <- rename(df, useVideoStrm = CPD_05B )
df <- rename(df, useAudioStrm = CPD_05C )
df <- rename(df, usePrdctServcs = CPD_05D )
df <- rename(df, useInfoServcs = CPD_05E )
df <- rename(df, useEduServcs = CPD_05F )
recode94Tab('useSocMedia', 'useEduServcs')
# Demographic variables
df <- rename(df, FamilySize = HHLDSIZC )
df <- rename(df, AgeGroup = AGEGRP )
df <- rename(df, isCanadaBorn = IMMIGRNC )
df <- rename(df, isMale = SEX )
df <- rename(df, Under18 = PCHILD )
df <- rename(df, MaritalStatus = MARSTATC )
df <- rename(df, dwellingType = PDWELCDC )
df <- rename(df, highestEd = PEDUC_LC )
df <- rename(df, isRural = RURURB )
df <- rename(df, weight = PERS_WGT)
# Recode some demographic data
df$isCanadaBorn <- ifelse(df$isCanadaBorn == 2, 0, 1)
df$isMale <- ifelse(df$isMale == 2, 0, 1)
df$isRural <- ifelse(df$isRural == 2, 0, 1)
recode9Tab('FamilySize','isRural')
Finally, I identified how many NA values were in the data and removed them all from the dataset.
# Identify missing values
sapply(df, function(x) sum(is.na(x)))
## noPersInfo noCdtCard onlyRptdSite lowCrdtAmt thirdPartyPmt
## 62 62 62 62 62
## lookedHTTPS usedStrongPswd noShopOnline useSocMedia useVideoStrm
## 62 62 62 512 708
## useAudioStrm usePrdctServcs useInfoServcs useEduServcs FamilySize
## 1439 2293 316 2299 0
## AgeGroup isCanadaBorn isMale Under18 MaritalStatus
## 0 0 0 0 0
## dwellingType highestEd isRural weight
## 0 0 0 0
# Omits all rows with an NA value
df <- na.omit(df)
Now that all of the data preprocessing was complete it was time to start doing some exploratory analysis. I started by using the kable function from the knitr library to display the head of the dataset. This allowed me to see some example responses from the people who participated in answering the questions.
# Head of Data
hdf <- head(df)
kab <- knitr::kable(hdf, caption = "Data Table",
booktabs = T, label = "kabletable")
kable_classic_2(kab, full_width = F)
| noPersInfo | noCdtCard | onlyRptdSite | lowCrdtAmt | thirdPartyPmt | lookedHTTPS | usedStrongPswd | noShopOnline | useSocMedia | useVideoStrm | useAudioStrm | usePrdctServcs | useInfoServcs | useEduServcs | FamilySize | AgeGroup | isCanadaBorn | isMale | Under18 | MaritalStatus | dwellingType | highestEd | isRural | weight | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6 | 1 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 3 | 2 | 2 | 2 | 1 | 2 | 3 | 6 | 1 | 1 | 0 | 1 | 1 | 6 | 1 | 4618.6834 |
| 7 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 0 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 4 | 1 | 1 | 0 | 1 | 1 | 4 | 1 | 5863.3536 |
| 14 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 2 | 2 | 1 | 2 | 2 | 5 | 1 | 1 | 0 | 4 | 1 | 4 | 1 | 3719.3613 |
| 17 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 5 | 1 | 1 | 0 | 1 | 1 | 6 | 1 | 1371.4025 |
| 28 | 1 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 2 | 2 | 2 | 2 | 2 | 5 | 1 | 1 | 0 | 2 | 1 | 7 | 0 | 755.6321 |
| 31 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 2 | 1 | 2 | 2 | 4 | 0 | 1 | 1 | 1 | 1 | 6 | 0 | 15897.3233 |
Next, I coded a function that would display a frequency table every time its called upon and used it to make a frequency table for all the variables within the dataset. This way I could see how people answered each question and how many people answered yes(1) or no(0) and increased(1), about the same(2), or decreased(3) regarding each question.
# Function to create a freq table
colFreqTable <- function(startCol, endCol) {
for (i in startCol:endCol) {
cat("-----------------\n")
cat(colnames(df[i]),"\n")
print(table(df[, i]))
cat("\n-----------------")
}
}
# Function to create a freq table
createfreqtable <- function(stColName, endColName) {
x <- which(colnames(df) == stColName)
y <- which(colnames(df) == endColName)
colFreqTable(x, y)
df <<- df
}
# Create freq table for each column
createfreqtable('noPersInfo','isRural')
## -----------------
## noPersInfo
##
## 0 1
## 736 400
##
## ----------------------------------
## noCdtCard
##
## 0 1
## 634 502
##
## ----------------------------------
## onlyRptdSite
##
## 0 1
## 281 855
##
## ----------------------------------
## lowCrdtAmt
##
## 0 1
## 914 222
##
## ----------------------------------
## thirdPartyPmt
##
## 0 1
## 836 300
##
## ----------------------------------
## lookedHTTPS
##
## 0 1
## 780 356
##
## ----------------------------------
## usedStrongPswd
##
## 0 1
## 708 428
##
## ----------------------------------
## noShopOnline
##
## 0 1
## 1004 132
##
## ----------------------------------
## useSocMedia
##
## 1 2 3
## 497 603 36
##
## ----------------------------------
## useVideoStrm
##
## 1 2 3
## 626 488 22
##
## ----------------------------------
## useAudioStrm
##
## 1 2 3
## 392 715 29
##
## ----------------------------------
## usePrdctServcs
##
## 1 2 3
## 201 892 43
##
## ----------------------------------
## useInfoServcs
##
## 1 2 3
## 435 688 13
##
## ----------------------------------
## useEduServcs
##
## 1 2 3
## 373 726 37
##
## ----------------------------------
## FamilySize
##
## 1 2 3 4 5
## 295 639 127 58 17
##
## ----------------------------------
## AgeGroup
##
## 1 2 3 4 5 6 7
## 93 226 297 237 164 90 29
##
## ----------------------------------
## isCanadaBorn
##
## 0 1
## 230 906
##
## ----------------------------------
## isMale
##
## 0 1
## 548 588
##
## ----------------------------------
## Under18
##
## 0 1
## 706 430
##
## ----------------------------------
## MaritalStatus
##
## 1 2 3 4
## 565 133 138 300
##
## ----------------------------------
## dwellingType
##
## 1 2 3 4
## 726 127 90 193
##
## ----------------------------------
## highestEd
##
## 1 2 3 4 5 6 7
## 42 203 79 248 37 337 190
##
## ----------------------------------
## isRural
##
## 0 1
## 936 200
##
## -----------------
Next, I used the stargazer function from the stargazer library to display some summary statistics regarding the data. Some of the statistics include: number of values, mean, standard deviation, min,and max. I also used the dim function to display how many rows and columns were in the dataset.
# Display min, max, mean, median, 1st and 3nd quartiles
stargazer(df,type = 'text',title = 'Summary Statistics', digits = 2)
##
## Summary Statistics
## =========================================================
## Statistic N Mean St. Dev. Min Max
## ---------------------------------------------------------
## noPersInfo 1,136 0.35 0.48 0 1
## noCdtCard 1,136 0.44 0.50 0 1
## onlyRptdSite 1,136 0.75 0.43 0 1
## lowCrdtAmt 1,136 0.20 0.40 0 1
## thirdPartyPmt 1,136 0.26 0.44 0 1
## lookedHTTPS 1,136 0.31 0.46 0 1
## usedStrongPswd 1,136 0.38 0.48 0 1
## noShopOnline 1,136 0.12 0.32 0 1
## useSocMedia 1,136 1.59 0.55 1 3
## useVideoStrm 1,136 1.47 0.54 1 3
## useAudioStrm 1,136 1.68 0.52 1 3
## usePrdctServcs 1,136 1.86 0.44 1 3
## useInfoServcs 1,136 1.63 0.51 1 3
## useEduServcs 1,136 1.70 0.52 1 3
## FamilySize 1,136 2.00 0.84 1 5
## AgeGroup 1,136 3.47 1.49 1 7
## isCanadaBorn 1,136 0.80 0.40 0 1
## isMale 1,136 0.52 0.50 0 1
## Under18 1,136 0.38 0.49 0 1
## MaritalStatus 1,136 2.15 1.29 1 4
## dwellingType 1,136 1.78 1.16 1 4
## highestEd 1,136 4.59 1.87 1 7
## isRural 1,136 0.18 0.38 0 1
## weight 1,136 9,113.84 13,612.85 253.48 196,544.10
## ---------------------------------------------------------
# Displays the rows and columns
dim(df)
## [1] 1136 24
Next, I used the stargazer function again to make and display a correlation matrix and a variance matrix. I also used the heatmap function to make and display a heatmap of the correlation matrix. With each of these analysis I would be able to see the correlation and relationships between each of the variables within the dataset.
# Correlation Matrix
correlation.matrix <- round(cor(df),2)
stargazer(correlation.matrix, header=FALSE, type="html", title="Correlation Matrix", digits = 2)
| noPersInfo | noCdtCard | onlyRptdSite | lowCrdtAmt | thirdPartyPmt | lookedHTTPS | usedStrongPswd | noShopOnline | useSocMedia | useVideoStrm | useAudioStrm | usePrdctServcs | useInfoServcs | useEduServcs | FamilySize | AgeGroup | isCanadaBorn | isMale | Under18 | MaritalStatus | dwellingType | highestEd | isRural | weight | |
| noPersInfo | 1 | 0.64 | 0.20 | 0.23 | 0.04 | 0.16 | 0.27 | -0.27 | 0 | -0.07 | -0.03 | -0.04 | -0.06 | -0.09 | 0.07 | -0.05 | -0.06 | -0.05 | 0.03 | -0.03 | 0.01 | 0.01 | -0.04 | 0.02 |
| noCdtCard | 0.64 | 1 | 0.26 | 0.20 | 0.12 | 0.22 | 0.30 | -0.32 | -0.03 | -0.09 | -0.07 | -0.06 | -0.08 | -0.06 | 0.05 | -0.07 | -0.02 | -0.12 | 0.05 | -0.04 | -0.02 | 0.07 | -0.04 | -0.02 |
| onlyRptdSite | 0.20 | 0.26 | 1 | 0.14 | 0.10 | 0.23 | 0.23 | -0.63 | -0.09 | -0.08 | -0.08 | -0.06 | -0.08 | -0.05 | 0.03 | -0.11 | 0.01 | -0.06 | 0.08 | -0.04 | 0.04 | 0.12 | -0.07 | -0.02 |
| lowCrdtAmt | 0.23 | 0.20 | 0.14 | 1 | 0.02 | 0.11 | 0.18 | -0.18 | 0.01 | 0 | -0.07 | 0 | -0.07 | -0.04 | -0.04 | -0.01 | -0.04 | -0.02 | 0.04 | 0.02 | 0.01 | -0.03 | -0.02 | -0.01 |
| thirdPartyPmt | 0.04 | 0.12 | 0.10 | 0.02 | 1 | 0.18 | 0.21 | -0.22 | -0.02 | -0.07 | -0.01 | 0 | -0.01 | -0.04 | -0.06 | -0.08 | -0.01 | 0.03 | 0.01 | 0.04 | 0.03 | -0.03 | -0.04 | -0.01 |
| lookedHTTPS | 0.16 | 0.22 | 0.23 | 0.11 | 0.18 | 1 | 0.40 | -0.24 | -0.01 | -0.11 | -0.09 | -0.08 | -0.06 | -0.06 | 0.01 | -0.10 | 0.01 | 0.04 | 0.03 | -0.03 | 0.02 | 0.06 | -0.02 | -0.01 |
| usedStrongPswd | 0.27 | 0.30 | 0.23 | 0.18 | 0.21 | 0.40 | 1 | -0.28 | 0.04 | -0.04 | -0.08 | -0.06 | -0.04 | -0.04 | -0.03 | -0.10 | 0.03 | 0.02 | 0.06 | 0 | 0.03 | 0.07 | -0.06 | 0 |
| noShopOnline | -0.27 | -0.32 | -0.63 | -0.18 | -0.22 | -0.24 | -0.28 | 1 | 0.08 | 0.14 | 0.12 | 0.08 | 0.12 | 0.12 | -0.05 | 0.18 | 0.03 | 0.04 | -0.11 | 0.04 | -0.02 | -0.12 | 0.09 | -0.01 |
| useSocMedia | 0 | -0.03 | -0.09 | 0.01 | -0.02 | -0.01 | 0.04 | 0.08 | 1 | 0.48 | 0.29 | 0.26 | 0.40 | 0.19 | -0.08 | 0.19 | 0.03 | 0.17 | -0.11 | -0.03 | 0.04 | -0.01 | 0.04 | -0.05 |
| useVideoStrm | -0.07 | -0.09 | -0.08 | 0 | -0.07 | -0.11 | -0.04 | 0.14 | 0.48 | 1 | 0.46 | 0.28 | 0.43 | 0.30 | -0.08 | 0.21 | 0.04 | 0.08 | -0.13 | -0.01 | 0 | -0.03 | 0.10 | -0.08 |
| useAudioStrm | -0.03 | -0.07 | -0.08 | -0.07 | -0.01 | -0.09 | -0.08 | 0.12 | 0.29 | 0.46 | 1 | 0.32 | 0.37 | 0.28 | -0.06 | 0.18 | 0.03 | 0.10 | -0.08 | -0.04 | -0.01 | -0.02 | 0.03 | -0.06 |
| usePrdctServcs | -0.04 | -0.06 | -0.06 | 0 | 0 | -0.08 | -0.06 | 0.08 | 0.26 | 0.28 | 0.32 | 1 | 0.33 | 0.34 | -0.04 | 0.15 | 0.07 | 0.08 | -0.08 | -0.06 | -0.03 | -0.03 | 0.01 | -0.06 |
| useInfoServcs | -0.06 | -0.08 | -0.08 | -0.07 | -0.01 | -0.06 | -0.04 | 0.12 | 0.40 | 0.43 | 0.37 | 0.33 | 1 | 0.33 | -0.05 | 0.08 | 0.08 | 0.08 | -0.09 | -0.03 | 0.02 | 0.01 | 0.06 | -0.07 |
| useEduServcs | -0.09 | -0.06 | -0.05 | -0.04 | -0.04 | -0.06 | -0.04 | 0.12 | 0.19 | 0.30 | 0.28 | 0.34 | 0.33 | 1 | -0.12 | 0.15 | 0.11 | 0.09 | -0.25 | 0.01 | 0.03 | -0.01 | 0.04 | -0.08 |
| FamilySize | 0.07 | 0.05 | 0.03 | -0.04 | -0.06 | 0.01 | -0.03 | -0.05 | -0.08 | -0.08 | -0.06 | -0.04 | -0.05 | -0.12 | 1 | -0.12 | -0.06 | -0.05 | 0.18 | -0.30 | -0.19 | -0.16 | 0.01 | 0.35 |
| AgeGroup | -0.05 | -0.07 | -0.11 | -0.01 | -0.08 | -0.10 | -0.10 | 0.18 | 0.19 | 0.21 | 0.18 | 0.15 | 0.08 | 0.15 | -0.12 | 1 | -0.02 | 0.14 | -0.23 | -0.30 | -0.09 | 0.04 | 0.12 | -0.24 |
| isCanadaBorn | -0.06 | -0.02 | 0.01 | -0.04 | -0.01 | 0.01 | 0.03 | 0.03 | 0.03 | 0.04 | 0.03 | 0.07 | 0.08 | 0.11 | -0.06 | -0.02 | 1 | -0.03 | -0.10 | 0.09 | -0.15 | -0.12 | 0.12 | -0.12 |
| isMale | -0.05 | -0.12 | -0.06 | -0.02 | 0.03 | 0.04 | 0.02 | 0.04 | 0.17 | 0.08 | 0.10 | 0.08 | 0.08 | 0.09 | -0.05 | 0.14 | -0.03 | 1 | -0.06 | -0.05 | -0.06 | 0.04 | -0.03 | 0.04 |
| Under18 | 0.03 | 0.05 | 0.08 | 0.04 | 0.01 | 0.03 | 0.06 | -0.11 | -0.11 | -0.13 | -0.08 | -0.08 | -0.09 | -0.25 | 0.18 | -0.23 | -0.10 | -0.06 | 1 | -0.30 | -0.09 | -0.02 | -0.07 | 0.11 |
| MaritalStatus | -0.03 | -0.04 | -0.04 | 0.02 | 0.04 | -0.03 | 0 | 0.04 | -0.03 | -0.01 | -0.04 | -0.06 | -0.03 | 0.01 | -0.30 | -0.30 | 0.09 | -0.05 | -0.30 | 1 | 0.13 | -0.13 | -0.05 | 0.08 |
| dwellingType | 0.01 | -0.02 | 0.04 | 0.01 | 0.03 | 0.02 | 0.03 | -0.02 | 0.04 | 0 | -0.01 | -0.03 | 0.02 | 0.03 | -0.19 | -0.09 | -0.15 | -0.06 | -0.09 | 0.13 | 1 | 0.06 | -0.15 | 0.05 |
| highestEd | 0.01 | 0.07 | 0.12 | -0.03 | -0.03 | 0.06 | 0.07 | -0.12 | -0.01 | -0.03 | -0.02 | -0.03 | 0.01 | -0.01 | -0.16 | 0.04 | -0.12 | 0.04 | -0.02 | -0.13 | 0.06 | 1 | -0.14 | -0.25 |
| isRural | -0.04 | -0.04 | -0.07 | -0.02 | -0.04 | -0.02 | -0.06 | 0.09 | 0.04 | 0.10 | 0.03 | 0.01 | 0.06 | 0.04 | 0.01 | 0.12 | 0.12 | -0.03 | -0.07 | -0.05 | -0.15 | -0.14 | 1 | -0.09 |
| weight | 0.02 | -0.02 | -0.02 | -0.01 | -0.01 | -0.01 | 0 | -0.01 | -0.05 | -0.08 | -0.06 | -0.06 | -0.07 | -0.08 | 0.35 | -0.24 | -0.12 | 0.04 | 0.11 | 0.08 | 0.05 | -0.25 | -0.09 | 1 |
# Heatmap
heatmap(cor(df), Rowv = NA, Colv = NA)
# Variance Matrix
variance.matrix <- round(var(df),2)
stargazer(variance.matrix, header=FALSE, type="html", title="Variance Matrix", digits = 2)
| noPersInfo | noCdtCard | onlyRptdSite | lowCrdtAmt | thirdPartyPmt | lookedHTTPS | usedStrongPswd | noShopOnline | useSocMedia | useVideoStrm | useAudioStrm | usePrdctServcs | useInfoServcs | useEduServcs | FamilySize | AgeGroup | isCanadaBorn | isMale | Under18 | MaritalStatus | dwellingType | highestEd | isRural | weight | |
| noPersInfo | 0.23 | 0.15 | 0.04 | 0.04 | 0.01 | 0.04 | 0.06 | -0.04 | 0 | -0.02 | -0.01 | -0.01 | -0.02 | -0.02 | 0.03 | -0.03 | -0.01 | -0.01 | 0.01 | -0.02 | 0 | 0.01 | -0.01 | 146.05 |
| noCdtCard | 0.15 | 0.25 | 0.06 | 0.04 | 0.03 | 0.05 | 0.07 | -0.05 | -0.01 | -0.02 | -0.02 | -0.01 | -0.02 | -0.02 | 0.02 | -0.05 | 0 | -0.03 | 0.01 | -0.03 | -0.01 | 0.07 | -0.01 | -157.96 |
| onlyRptdSite | 0.04 | 0.06 | 0.19 | 0.02 | 0.02 | 0.05 | 0.05 | -0.09 | -0.02 | -0.02 | -0.02 | -0.01 | -0.02 | -0.01 | 0.01 | -0.07 | 0 | -0.01 | 0.02 | -0.02 | 0.02 | 0.10 | -0.01 | -132.68 |
| lowCrdtAmt | 0.04 | 0.04 | 0.02 | 0.16 | 0 | 0.02 | 0.04 | -0.02 | 0 | 0 | -0.01 | 0 | -0.01 | -0.01 | -0.01 | -0.01 | -0.01 | 0 | 0.01 | 0.01 | 0 | -0.02 | 0 | -54.40 |
| thirdPartyPmt | 0.01 | 0.03 | 0.02 | 0 | 0.19 | 0.04 | 0.05 | -0.03 | -0.01 | -0.02 | 0 | 0 | 0 | -0.01 | -0.02 | -0.05 | 0 | 0.01 | 0 | 0.02 | 0.01 | -0.02 | -0.01 | -32.58 |
| lookedHTTPS | 0.04 | 0.05 | 0.05 | 0.02 | 0.04 | 0.22 | 0.09 | -0.04 | 0 | -0.03 | -0.02 | -0.02 | -0.01 | -0.01 | 0 | -0.07 | 0 | 0.01 | 0.01 | -0.02 | 0.01 | 0.05 | 0 | -79.35 |
| usedStrongPswd | 0.06 | 0.07 | 0.05 | 0.04 | 0.05 | 0.09 | 0.24 | -0.04 | 0.01 | -0.01 | -0.02 | -0.01 | -0.01 | -0.01 | -0.01 | -0.07 | 0.01 | 0.01 | 0.01 | 0 | 0.02 | 0.06 | -0.01 | -20.92 |
| noShopOnline | -0.04 | -0.05 | -0.09 | -0.02 | -0.03 | -0.04 | -0.04 | 0.10 | 0.01 | 0.02 | 0.02 | 0.01 | 0.02 | 0.02 | -0.01 | 0.08 | 0 | 0.01 | -0.02 | 0.02 | -0.01 | -0.07 | 0.01 | -44.02 |
| useSocMedia | 0 | -0.01 | -0.02 | 0 | -0.01 | 0 | 0.01 | 0.01 | 0.30 | 0.14 | 0.08 | 0.06 | 0.11 | 0.06 | -0.04 | 0.16 | 0.01 | 0.05 | -0.03 | -0.02 | 0.02 | -0.01 | 0.01 | -375.10 |
| useVideoStrm | -0.02 | -0.02 | -0.02 | 0 | -0.02 | -0.03 | -0.01 | 0.02 | 0.14 | 0.29 | 0.13 | 0.07 | 0.12 | 0.08 | -0.03 | 0.17 | 0.01 | 0.02 | -0.03 | -0.01 | 0 | -0.03 | 0.02 | -617.96 |
| useAudioStrm | -0.01 | -0.02 | -0.02 | -0.01 | 0 | -0.02 | -0.02 | 0.02 | 0.08 | 0.13 | 0.27 | 0.07 | 0.10 | 0.08 | -0.03 | 0.14 | 0.01 | 0.03 | -0.02 | -0.03 | 0 | -0.02 | 0.01 | -422.23 |
| usePrdctServcs | -0.01 | -0.01 | -0.01 | 0 | 0 | -0.02 | -0.01 | 0.01 | 0.06 | 0.07 | 0.07 | 0.20 | 0.07 | 0.08 | -0.02 | 0.10 | 0.01 | 0.02 | -0.02 | -0.03 | -0.02 | -0.03 | 0 | -374.32 |
| useInfoServcs | -0.02 | -0.02 | -0.02 | -0.01 | 0 | -0.01 | -0.01 | 0.02 | 0.11 | 0.12 | 0.10 | 0.07 | 0.26 | 0.09 | -0.02 | 0.06 | 0.02 | 0.02 | -0.02 | -0.02 | 0.01 | 0.01 | 0.01 | -453.95 |
| useEduServcs | -0.02 | -0.02 | -0.01 | -0.01 | -0.01 | -0.01 | -0.01 | 0.02 | 0.06 | 0.08 | 0.08 | 0.08 | 0.09 | 0.27 | -0.05 | 0.11 | 0.02 | 0.02 | -0.06 | 0 | 0.02 | -0.01 | 0.01 | -566.39 |
| FamilySize | 0.03 | 0.02 | 0.01 | -0.01 | -0.02 | 0 | -0.01 | -0.01 | -0.04 | -0.03 | -0.03 | -0.02 | -0.02 | -0.05 | 0.71 | -0.14 | -0.02 | -0.02 | 0.08 | -0.32 | -0.19 | -0.26 | 0 | 4,030.57 |
| AgeGroup | -0.03 | -0.05 | -0.07 | -0.01 | -0.05 | -0.07 | -0.07 | 0.08 | 0.16 | 0.17 | 0.14 | 0.10 | 0.06 | 0.11 | -0.14 | 2.21 | -0.01 | 0.10 | -0.17 | -0.58 | -0.15 | 0.11 | 0.07 | -4,904.61 |
| isCanadaBorn | -0.01 | 0 | 0 | -0.01 | 0 | 0 | 0.01 | 0 | 0.01 | 0.01 | 0.01 | 0.01 | 0.02 | 0.02 | -0.02 | -0.01 | 0.16 | -0.01 | -0.02 | 0.05 | -0.07 | -0.09 | 0.02 | -683.87 |
| isMale | -0.01 | -0.03 | -0.01 | 0 | 0.01 | 0.01 | 0.01 | 0.01 | 0.05 | 0.02 | 0.03 | 0.02 | 0.02 | 0.02 | -0.02 | 0.10 | -0.01 | 0.25 | -0.01 | -0.03 | -0.03 | 0.04 | 0 | 253.73 |
| Under18 | 0.01 | 0.01 | 0.02 | 0.01 | 0 | 0.01 | 0.01 | -0.02 | -0.03 | -0.03 | -0.02 | -0.02 | -0.02 | -0.06 | 0.08 | -0.17 | -0.02 | -0.01 | 0.24 | -0.18 | -0.05 | -0.02 | -0.01 | 721.59 |
| MaritalStatus | -0.02 | -0.03 | -0.02 | 0.01 | 0.02 | -0.02 | 0 | 0.02 | -0.02 | -0.01 | -0.03 | -0.03 | -0.02 | 0 | -0.32 | -0.58 | 0.05 | -0.03 | -0.18 | 1.65 | 0.19 | -0.32 | -0.02 | 1,465.67 |
| dwellingType | 0 | -0.01 | 0.02 | 0 | 0.01 | 0.01 | 0.02 | -0.01 | 0.02 | 0 | 0 | -0.02 | 0.01 | 0.02 | -0.19 | -0.15 | -0.07 | -0.03 | -0.05 | 0.19 | 1.35 | 0.13 | -0.07 | 732.54 |
| highestEd | 0.01 | 0.07 | 0.10 | -0.02 | -0.02 | 0.05 | 0.06 | -0.07 | -0.01 | -0.03 | -0.02 | -0.03 | 0.01 | -0.01 | -0.26 | 0.11 | -0.09 | 0.04 | -0.02 | -0.32 | 0.13 | 3.50 | -0.10 | -6,357.33 |
| isRural | -0.01 | -0.01 | -0.01 | 0 | -0.01 | 0 | -0.01 | 0.01 | 0.01 | 0.02 | 0.01 | 0 | 0.01 | 0.01 | 0 | 0.07 | 0.02 | 0 | -0.01 | -0.02 | -0.07 | -0.10 | 0.15 | -461.78 |
| weight | 146.05 | -157.96 | -132.68 | -54.40 | -32.58 | -79.35 | -20.92 | -44.02 | -375.10 | -617.96 | -422.23 | -374.32 | -453.95 | -566.39 | 4,030.57 | -4,904.61 | -683.87 | 253.73 | 721.59 | 1,465.67 | 732.54 | -6,357.33 | -461.78 | 185,309,803.00 |
Using the data from the correlation matrix I was able to determine which variables had a strong correlation with each other. Using this knowledge I decided to combine all the columns that had relation to the use of free online services and all the columns that had relation to steps taken when shopping online. For the columns involving steps taken while shopping online I was just able to add all of the values from each of the columns together to get the total number of steps taken while shopping online per person.
# Finds the sum of steps taken per person
df <- mutate(df, sumSteps = noPersInfo + noCdtCard + onlyRptdSite +
lowCrdtAmt + thirdPartyPmt + lookedHTTPS +
usedStrongPswd + noShopOnline)
For the columns involving free online services used I wasn’t able to add them right away because the answers were being measure on a scale from 1-3 which wouldn’t add together properly to get the result I want. So to fix this I recoded each of the columns so that each 3 that appears becomes a -1 since the 3 stands for decreases, each 2 that appears becomes a 0 since the 2 stands for about the same, and each 1 will remain unchanged because 1 stands for increases.
# Recodes 3 (Decreases) to -1, 2 (About the same) to 0, 1(increases) stays at 1
df$useSocMedia <- ifelse(df$useSocMedia == 3, -1, df$useSocMedia)
df$useSocMedia <- ifelse(df$useSocMedia == 2, 0, df$useSocMedia)
df$useVideoStrm <- ifelse(df$useVideoStrm == 3, -1, df$useVideoStrm)
df$useVideoStrm <- ifelse(df$useVideoStrm == 2, 0, df$useVideoStrm)
df$useAudioStrm <- ifelse(df$useAudioStrm == 3, -1, df$useAudioStrm)
df$useAudioStrm <- ifelse(df$useAudioStrm == 2, 0, df$useAudioStrm)
df$usePrdctServcs <- ifelse(df$usePrdctServcs == 3, -1, df$usePrdctServcs)
df$usePrdctServcs <- ifelse(df$usePrdctServcs == 2, 0, df$usePrdctServcs)
df$useInfoServcs <- ifelse(df$useInfoServcs == 3, -1, df$useInfoServcs)
df$useInfoServcs <- ifelse(df$useInfoServcs == 2, 0, df$useInfoServcs)
df$useEduServcs <- ifelse(df$useEduServcs == 3, -1, df$useEduServcs)
df$useEduServcs <- ifelse(df$useEduServcs == 2, 0, df$useEduServcs)
Now with the data changed I will be able to add all of the values from each of the columns together and get the result I’m looking for.
# Finds the sum of services used per person
df <- mutate(df, sumServcs = useSocMedia + useVideoStrm + useAudioStrm +
usePrdctServcs + useInfoServcs + useEduServcs)
I also added 6 to each of the values within the newly made ‘sumServcs’ column to ensure that all of the values are positive values and are able to be used for certain types of analysis.
# Add 6 to all values so that none of the values are negative
df$sumServcs <- df$sumServcs + 6
I also created a frequency table for each of my newly made columns to see the results of adding all those values together. The results should represent how many steps a person took while shopping online ranging from 0 to 8 since their were a total of 8 steps a person could take and how many free online services they started using more or less often ranging from 0 to 12, 6 being about the same, 0-5 being decreased, and 7-12 being increased.
# Creates a freq table for sumSteps and sumServcs
createfreqtable('sumSteps','sumServcs')
## -----------------
## sumSteps
##
## 1 2 3 4 5 6 7
## 362 204 180 174 130 63 23
##
## ----------------------------------
## sumServcs
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12
## 4 3 2 6 6 19 295 158 164 183 146 79 71
##
## -----------------
Now that I had combined my data into two new columns, I could start focusing on analyzing that data and the demographic data and come up with some research questions and hypothesis.
Some of the research questions I came up with are as follows:
Now that the research question have been established the main focus will be on analyzing the data in a way that will help to draw us closer to answering the research questions.
Using the hist function, I was able to make and display a histogram for each of the two new columns, ‘sumSteps’ and ‘sumServcs’. This graph will allow me to have a visual representation of the frequency of each outcome and see the differential between the steps people took and the services they used.
# Create histograms for specified columns
hist(df$sumSteps)
hist(df$sumServcs)
Using the boxplot function, I was able to make and display a box and whisker plot for each of the two new columns, ‘sumSteps’ and ‘sumServcs’. This plot will allow me to see if their are any outliers in the data that I need to be wary of.
# Create box plots for specified columns
boxplot(df$sumSteps)
boxplot(df$sumServcs)
Now that the exploratory analysis is finished its time to start doing some predictive analysis and work towards answering the research questions.The two main forms of predictive analysis are linear regression and logistic regression. Since we already combined the previous variables into the sums of steps taken and services used we can use those variables for linear regression. For logistic regression we are going to need to make two new variables that are binomial. To accomplish this I created two new variables one for the steps taken and one for services used. I did this by stating any values larger or equal to 3 in the ‘sumSteps’ column will be made a 1, any values less than 3 will become a 0. This would help us discover the answer to one of our research questions because the people who are labled as a 1 would be the people who took 3 or more steps while shopping online. I also took any value greater than 6 in the ‘sumServcs’ column and made it a 1 while anything equal to or less than 6 would be a 0. This would also help us solve one of our research questions because anyone with a score of 1 would mean they experienced an increase in using free online services.
# Creates a dummy variable that states if sumSteps taken by shoppers is greater than or equal to 3, than the variables becomes a 1 (3+ steps taken), otherwise the variable becomes a 0 (less than 3 steps taken)
df$stepsDummy <- ifelse(df$sumSteps >= 3, 1, 0)
# Creates a dummy variables that states if sumServcs used by people is greater than 0, than the variable becomes a 1 (use of services increased), otherwise the variable becomes a 0 (use of services decreased or remained unchanged)
df$servcsDummy <- ifelse(df$sumServcs > 6, 1, 0)
Using the glm function, I was able to perform linear regression on ‘sumSteps’ and ‘sumServcs’ as dependent variables while I used all off the demographic factors as independent variables. Using the p-values from the regression analysis I was able to determine which demographic factors were significant and run the linear regression again but this time with only significant variables.
lin.1 <- glm(sumSteps ~ sumServcs + FamilySize + AgeGroup + isCanadaBorn + isMale + Under18 +
MaritalStatus + highestEd + isRural + weight, data = df, family = poisson)
lin.2 <- glm(sumSteps ~ sumServcs + AgeGroup, data = df, family = poisson)
lin.3 <- glm(sumServcs ~ sumSteps + FamilySize + AgeGroup + isCanadaBorn + isMale + Under18 +
MaritalStatus + highestEd + isRural + weight, data = df, family = poisson)
lin.4 <- glm(sumServcs ~ AgeGroup + isMale + Under18, data = df, family = poisson)
Using the glm function again, I was able to perform logistic regression this time using the ‘stepsDummy’ and ‘servcsDummy’ variables since they are binomial. The binomial variables would be used as the dependent variables while the demographic factors would be used as the independent variables. Using the p-values once again from the logistic regression analysis I was able to determine which demographic factors were significant and run the logistic regression again but this time with only significant variables.
log.1 <- glm(stepsDummy ~ sumServcs + FamilySize + AgeGroup + isCanadaBorn + isMale + Under18 +
MaritalStatus + highestEd + isRural + weight, data = df, family = binomial)
log.2 <- glm(stepsDummy ~ sumServcs + AgeGroup + highestEd, data = df, family = binomial)
log.3 <- glm(servcsDummy ~ sumSteps + FamilySize + AgeGroup + isCanadaBorn + isMale + Under18 +
MaritalStatus + highestEd + isRural + weight, data = df, family = binomial)
log.4 <- glm(servcsDummy ~ sumSteps + AgeGroup + isMale + Under18 +
highestEd + isRural, data = df, family = binomial)
Using the stargazer function, I was able to display my linear and logistic regressions in a table where I can easily see, interpret, and compare the values of the linear and logistic regressions. In this table I could see the significance of each variable, the amount of observations, the coefficients, and many other useful types of information.
stargazer(lin.1, lin.2, log.1, log.2, header = FALSE,
title = 'Regression analysis for steps taken while shopping online',
type = 'text', digits = 2,
dep.var.labels.include = FALSE,
model.names = FALSE, model.numbers = FALSE,
column.labels = c('Linear','Logistic'),
column.separate = c(2,2),
style = 'aer',
covariate.labels = c('Sum of Services', 'Family Size', 'Age Group',
'Born in Canada', 'Is a Male','Is Under 18','Marital Status',
'Highest Education','Lives in Rural Area','Weight'))
##
## Regression analysis for steps taken while shopping online
## =============================================================
## Linear Logistic
## -------------------------------------------------------------
## Sum of Services 0.02*** 0.03*** 0.09*** 0.09***
## (0.01) (0.01) (0.03) (0.03)
##
## Family Size -0.01 -0.00
## (0.02) (0.08)
##
## Age Group -0.04*** -0.04*** -0.11** -0.10**
## (0.01) (0.01) (0.05) (0.04)
##
## Born in Canada 0.00 0.05
## (0.05) (0.16)
##
## Is a Male -0.02 -0.03
## (0.04) (0.12)
##
## Is Under 18 0.01 -0.02
## (0.04) (0.14)
##
## Marital Status -0.02 -0.06
## (0.02) (0.06)
##
## Highest Education 0.01 0.08** 0.09***
## (0.01) (0.03) (0.03)
##
## Lives in Rural Area -0.07 -0.22
## (0.05) (0.16)
##
## Weight -0.00 -0.00
## (0.00) (0.00)
##
## Constant 1.02*** 0.95*** -0.53 -0.84**
## (0.15) (0.09) (0.52) (0.34)
##
## Observations 1,136 1,136 1,136 1,136
## Log Likelihood -2,125.44 -2,130.26 -771.81 -773.48
## Akaike Inf. Crit. 4,272.88 4,266.52 1,565.62 1,554.95
## -------------------------------------------------------------
## Notes: ***Significant at the 1 percent level.
## **Significant at the 5 percent level.
## *Significant at the 10 percent level.
stargazer(lin.3, lin.4, log.3, log.4, header = FALSE,
title = 'Regression analysis for use of free online services',
type = 'text', digits = 2,
dep.var.labels.include = FALSE,
model.names = FALSE, model.numbers = FALSE,
column.labels = c('Linear','Logistic'),
column.separate = c(2,2),
style = 'aer',
covariate.labels = c('Sum of Services', 'Family Size', 'Age Group',
'Born in Canada', 'Is a Male','Is Under 18','Marital Status',
'Highest Education','Lives in Rural Area','Weight'))
##
## Regression analysis for use of free online services
## =============================================================
## Linear Logistic
## -------------------------------------------------------------
## Sum of Services 0.01* 0.14*** 0.14***
## (0.01) (0.04) (0.04)
##
## Family Size 0.02 0.12
## (0.01) (0.10)
##
## Age Group -0.03*** -0.03*** -0.23*** -0.25***
## (0.01) (0.01) (0.05) (0.05)
##
## Born in Canada -0.04* -0.14
## (0.03) (0.18)
##
## Is a Male -0.06*** -0.06*** -0.33** -0.32**
## (0.02) (0.02) (0.14) (0.14)
##
## Is Under 18 0.07*** 0.07*** 0.53*** 0.53***
## (0.02) (0.02) (0.16) (0.15)
##
## Marital Status 0.01 0.05
## (0.01) (0.07)
##
## Highest Education 0.01 0.11*** 0.10***
## (0.01) (0.04) (0.04)
##
## Lives in Rural Area -0.02 -0.35** -0.39**
## (0.03) (0.18) (0.17)
##
## Weight 0.00 0.00
## (0.00) (0.00)
##
## Constant 2.09*** 2.21*** 0.55 1.04***
## (0.08) (0.03) (0.53) (0.29)
##
## Observations 1,136 1,136 1,136 1,136
## Log Likelihood -2,506.30 -2,512.15 -635.38 -637.72
## Akaike Inf. Crit. 5,034.59 5,032.30 1,292.75 1,289.44
## -------------------------------------------------------------
## Notes: ***Significant at the 1 percent level.
## **Significant at the 5 percent level.
## *Significant at the 10 percent level.
Some conclusions that can be made regarding some of the exploratory analysis are as follows:
Some conclusions that can be made regarding some of the predictive analysis are as follows:
Some conclusions that can be made regarding the research questions are as follows:
Some recommendations I have are as follows:
# Libraries used
library(dplyr)
library(kableExtra)
library(stargazer)
# Read the file
df <- read.csv("surveydata.csv", header = T)
# Keep only the columns I need
df <-select(df,PCS_45A:CPD_05F,HHLDSIZC:PERS_WGT)
# Function to recode all 2,6,9 to NA
colRecode269 <- function(startCol, endCol) {
for (i in startCol:endCol) {
df[, i] <<- ifelse(df[, i] == 2, 0, df[, i])
df[, i] <<- ifelse(df[, i] == 6, NA, df[, i])
df[, i] <<- ifelse(df[, i] == 9, NA, df[, i])
}
}
# Function to recode all 9,4 to NA
colRecode94 <- function(startCol, endCol) {
for (i in startCol:endCol) {
df[, i] <<- ifelse(df[, i] == 9, NA, df[, i])
df[, i] <<- ifelse(df[, i] == 4, NA, df[, i])
}
}
# Function to recode all 9 to NA
colRecode9 <- function(startCol, endCol) {
for (i in startCol:endCol) {
df[, i] <<- ifelse(df[, i] == 9, NA, df[, i])
}
}
# Function to recode all 2,6,9 to NA
recode269Tab <- function(stColName, endColName) {
x <- which(colnames(df) == stColName)
y <- which(colnames(df) == endColName)
colRecode269(x, y)
df <<- df
}
# Function to recode all 9,4 to NA
recode94Tab <- function(stColName, endColName) {
x <- which(colnames(df) == stColName)
y <- which(colnames(df) == endColName)
colRecode94(x, y)
df <<- df
}
# Function to recode all 9 to NA
recode9Tab <- function(stColName, endColName) {
x <- which(colnames(df) == stColName)
y <- which(colnames(df) == endColName)
colRecode9(x, y)
df <<- df
}
# Behavior during shopping online (PCS_45)
df <- rename(df, noPersInfo = PCS_45A )
df <- rename(df, noCdtCard = PCS_45B )
df <- rename(df, onlyRptdSite = PCS_45C)
df <- rename(df, lowCrdtAmt = PCS_45D )
df <- rename(df, thirdPartyPmt = PCS_45E )
df <- rename(df, lookedHTTPS = PCS_45F )
df <- rename(df, usedStrongPswd = PCS_45G )
df <- rename(df, noShopOnline = PCS_45H )
recode269Tab('noPersInfo', 'noShopOnline')
# Household use of the following - How did it change (CPD_05)
df <- rename(df, useSocMedia = CPD_05A )
df <- rename(df, useVideoStrm = CPD_05B )
df <- rename(df, useAudioStrm = CPD_05C )
df <- rename(df, usePrdctServcs = CPD_05D )
df <- rename(df, useInfoServcs = CPD_05E )
df <- rename(df, useEduServcs = CPD_05F )
recode94Tab('useSocMedia', 'useEduServcs')
# Demographic variables
df <- rename(df, FamilySize = HHLDSIZC )
df <- rename(df, AgeGroup = AGEGRP )
df <- rename(df, isCanadaBorn = IMMIGRNC )
df <- rename(df, isMale = SEX )
df <- rename(df, Under18 = PCHILD )
df <- rename(df, MaritalStatus = MARSTATC )
df <- rename(df, dwellingType = PDWELCDC )
df <- rename(df, highestEd = PEDUC_LC )
df <- rename(df, isRural = RURURB )
df <- rename(df, weight = PERS_WGT)
# Recode some demographic data
df$isCanadaBorn <- ifelse(df$isCanadaBorn == 2, 0, 1)
df$isMale <- ifelse(df$isMale == 2, 0, 1)
df$isRural <- ifelse(df$isRural == 2, 0, 1)
recode9Tab('FamilySize','isRural')
# Identify missing values
sapply(df, function(x) sum(is.na(x)))
# Omits all rows with an NA value
df <- na.omit(df)
# Head of Data
hdf <- head(df)
kab <- knitr::kable(hdf, caption = "Data Table",
booktabs = T, label = "kabletable")
kable_classic_2(kab, full_width = F)
# Function to create a freq table
colFreqTable <- function(startCol, endCol) {
for (i in startCol:endCol) {
cat("-----------------\n")
cat(colnames(df[i]),"\n")
print(table(df[, i]))
cat("\n-----------------")
}
}
# Function to create a freq table
createfreqtable <- function(stColName, endColName) {
x <- which(colnames(df) == stColName)
y <- which(colnames(df) == endColName)
colFreqTable(x, y)
df <<- df
}
# Create freq table for each column
createfreqtable('noPersInfo','isRural')
# Display min, max, mean, median, 1st and 3nd quartiles
stargazer(df,type = 'text',title = 'Summary Statistics', digits = 2)
# Displays the rows and columns
dim(df)
# Correlation Matrix
correlation.matrix <- round(cor(df),2)
stargazer(correlation.matrix, header=FALSE, type="html", title="Correlation Matrix", digits = 2)
# Heatmap
heatmap(cor(df), Rowv = NA, Colv = NA)
# Variance Matrix
variance.matrix <- round(var(df),2)
stargazer(variance.matrix, header=FALSE, type="html", title="Variance Matrix", digits = 2)
# Finds the sum of steps taken per person
df <- mutate(df, sumSteps = noPersInfo + noCdtCard + onlyRptdSite +
lowCrdtAmt + thirdPartyPmt + lookedHTTPS +
usedStrongPswd + noShopOnline)
# Recodes 3 (Decreases) to -1, 2 (About the same) to 0, 1(increases) stays at 1
df$useSocMedia <- ifelse(df$useSocMedia == 3, -1, df$useSocMedia)
df$useSocMedia <- ifelse(df$useSocMedia == 2, 0, df$useSocMedia)
df$useVideoStrm <- ifelse(df$useVideoStrm == 3, -1, df$useVideoStrm)
df$useVideoStrm <- ifelse(df$useVideoStrm == 2, 0, df$useVideoStrm)
df$useAudioStrm <- ifelse(df$useAudioStrm == 3, -1, df$useAudioStrm)
df$useAudioStrm <- ifelse(df$useAudioStrm == 2, 0, df$useAudioStrm)
df$usePrdctServcs <- ifelse(df$usePrdctServcs == 3, -1, df$usePrdctServcs)
df$usePrdctServcs <- ifelse(df$usePrdctServcs == 2, 0, df$usePrdctServcs)
df$useInfoServcs <- ifelse(df$useInfoServcs == 3, -1, df$useInfoServcs)
df$useInfoServcs <- ifelse(df$useInfoServcs == 2, 0, df$useInfoServcs)
df$useEduServcs <- ifelse(df$useEduServcs == 3, -1, df$useEduServcs)
df$useEduServcs <- ifelse(df$useEduServcs == 2, 0, df$useEduServcs)
# Finds the sum of services used per person
df <- mutate(df, sumServcs = useSocMedia + useVideoStrm + useAudioStrm +
usePrdctServcs + useInfoServcs + useEduServcs)
# Add 6 to all values so that none of the values are negative
df$sumServcs <- df$sumServcs + 6
# Creates a freq table for sumSteps and sumServcs
createfreqtable('sumSteps','sumServcs')
# Create histograms for specified columns
hist(df$sumSteps)
hist(df$sumServcs)
# Create box plots for specified columns
boxplot(df$sumSteps)
boxplot(df$sumServcs)
# Creates a dummy variable that states if sumSteps taken by shoppers is greater than or equal to 3, than the variables becomes a 1 (3+ steps taken), otherwise the variable becomes a 0 (less than 3 steps taken)
df$stepsDummy <- ifelse(df$sumSteps >= 3, 1, 0)
# Creates a dummy variables that states if sumServcs used by people is greater than 0, than the variable becomes a 1 (use of services increased), otherwise the variable becomes a 0 (use of services decreased or remained unchanged)
df$servcsDummy <- ifelse(df$sumServcs > 6, 1, 0)
lin.1 <- glm(sumSteps ~ sumServcs + FamilySize + AgeGroup + isCanadaBorn + isMale + Under18 +
MaritalStatus + highestEd + isRural + weight, data = df, family = poisson)
lin.2 <- glm(sumSteps ~ sumServcs + AgeGroup, data = df, family = poisson)
lin.3 <- glm(sumServcs ~ sumSteps + FamilySize + AgeGroup + isCanadaBorn + isMale + Under18 +
MaritalStatus + highestEd + isRural + weight, data = df, family = poisson)
lin.4 <- glm(sumServcs ~ AgeGroup + isMale + Under18, data = df, family = poisson)
log.1 <- glm(stepsDummy ~ sumServcs + FamilySize + AgeGroup + isCanadaBorn + isMale + Under18 +
MaritalStatus + highestEd + isRural + weight, data = df, family = binomial)
log.2 <- glm(stepsDummy ~ sumServcs + AgeGroup + highestEd, data = df, family = binomial)
log.3 <- glm(servcsDummy ~ sumSteps + FamilySize + AgeGroup + isCanadaBorn + isMale + Under18 +
MaritalStatus + highestEd + isRural + weight, data = df, family = binomial)
log.4 <- glm(servcsDummy ~ sumSteps + AgeGroup + isMale + Under18 +
highestEd + isRural, data = df, family = binomial)
stargazer(lin.1, lin.2, log.1, log.2, header = FALSE,
title = 'Regression analysis for steps taken while shopping online',
type = 'text', digits = 2,
dep.var.labels.include = FALSE,
model.names = FALSE, model.numbers = FALSE,
column.labels = c('Linear','Logistic'),
column.separate = c(2,2),
style = 'aer',
covariate.labels = c('Sum of Services', 'Family Size', 'Age Group',
'Born in Canada', 'Is a Male','Is Under 18','Marital Status',
'Highest Education','Lives in Rural Area','Weight'))
stargazer(lin.3, lin.4, log.3, log.4, header = FALSE,
title = 'Regression analysis for use of free online services',
type = 'text', digits = 2,
dep.var.labels.include = FALSE,
model.names = FALSE, model.numbers = FALSE,
column.labels = c('Linear','Logistic'),
column.separate = c(2,2),
style = 'aer',
covariate.labels = c('Sum of Services', 'Family Size', 'Age Group',
'Born in Canada', 'Is a Male','Is Under 18','Marital Status',
'Highest Education','Lives in Rural Area','Weight'))