Read and prep file
# Import the project data set from disk.
# NOTE(review): the path is absolute and machine-specific; the script will
# only run where this exact file location exists.
d <- read.csv("/Users/Urvi.Basu/Desktop/project2.data.csv")
# Pull the ChatUsage column out under a shorter working name
x <- d$ChatUsage
# Abort early unless the column holds numeric values
if (!is.numeric(x)) {
  stop("ChatUsage is not numeric")
}
Calculate descriptives with outliers
# Central tendency of the full ChatUsage vector
mean(x)
## [1] 3.86
median(x)
## [1] 3
# Spread: standard deviation, variance, min/max, and interquartile range
sd(x)
## [1] 2.364793
var(x)
## [1] 5.592245
range(x)
## [1] 0 10
IQR(x)
## [1] 3
# Quartile summary plus the mean in one table
summary(x)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 2.00 3.00 3.86 5.00 10.00
Outliers
# Identify outliers with the 1.5 * IQR fence rule and build a filtered copy
# of x; also report the z-score of the largest observation.

# z-score of the maximum value
z_max <- (max(x) - mean(x)) / sd(x)

# Quartiles and interquartile range
Q1 <- quantile(x, 0.25)
Q3 <- quantile(x, 0.75)
I <- IQR(x)

# Fences sit 1.5 IQRs below Q1 and above Q3
LF <- Q1 - 1.5 * I
UF <- Q3 + 1.5 * I

# Positions in x whose values fall outside the fences
Outliers <- which(x < LF | x > UF)
# Values of x that lie on or between the fences
chat_nooutliers <- x[x >= LF & x <= UF]

# Print the outlier positions
Outliers
## [1] 13
cat("Z-score of maximum value:", round(z_max, 3), "\n\n")
## Z-score of maximum value: 2.596
cat("Q1:", round(Q1, 3), "\n")
## Q1: 2
cat("Q3:", round(Q3, 3), "\n")
## Q3: 5
cat("IQR:", round(I, 3), "\n\n")
## IQR: 3
cat("Lower Fence:", round(LF, 3), "\n")
## Lower Fence: -2.5
cat("Upper Fence:", round(UF, 3), "\n\n")
## Upper Fence: 9.5
Calculate descriptives without outliers
# Central tendency after dropping values outside the IQR fences
mean(chat_nooutliers)
## [1] 3.734694
median(chat_nooutliers)
## [1] 3
# Spread: standard deviation, variance, min/max, and interquartile range
sd(chat_nooutliers)
## [1] 2.215246
var(chat_nooutliers)
## [1] 4.907313
range(chat_nooutliers)
## [1] 0 9
IQR(chat_nooutliers)
## [1] 3
# Quartile summary plus the mean in one table
summary(chat_nooutliers)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 3.000 3.735 5.000 9.000
Plots
# Plots (ggplot2)
library(ggplot2)

# histogram
# Density-scaled histogram of the filtered values (30 bins) with a kernel
# density curve drawn on top; after_stat(density) puts both on the same scale.
ggplot(
  data.frame(chat_nooutliers = chat_nooutliers),
  aes(x = chat_nooutliers)
) +
  geom_histogram(aes(y = after_stat(density)), bins = 30) +
  geom_density()

# box plot
# Single box of the filtered values. The constant "" is mapped to the x
# aesthetic so the box is drawn as one unlabeled category; the aesthetic must
# be named `x` (`chat_nooutliers` is not a ggplot2 aesthetic name).
ggplot(
  data.frame(chat_nooutliers = chat_nooutliers),
  aes(x = "", y = chat_nooutliers)
) +
  geom_boxplot()

# violin plot
# Violin of the ChatUsage vector with the individual observations overlaid as
# a centered dot plot; x = "" places everything in one unlabeled category.
# NOTE(review): this plot uses x (outliers included) while the histogram and
# box plot use chat_nooutliers; confirm that is intended.
ggplot(data.frame(x = x), aes(x = "", y = x)) +
  geom_violin() +
  geom_dotplot(binaxis = "y", stackdir = "center")
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
