# Load the survey spreadsheet; the .xlsx must sit in the same folder as this
# .Rmd. The space before ".xlsx" is part of the file name used here.
data <- read_excel(path = "Statistics project Data Collection .xlsx")
# make.names() rewrites the headers into syntactically valid R names, so the
# columns can be referenced without backticks.
names(data) <- make.names(names(data))
# Print the cleaned names, then a compact overview, to confirm the structure.
print(names(data))
## [1] "Person.." "Pick.ups"
glimpse(data)
## Rows: 50
## Columns: 2
## $ Person.. <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ Pick.ups <dbl> 90, 120, 170, 154, 124, 175, 143, 124, 115, 110, 101, 85, 190…
# Build the analysis vector `x` from the pickups column.
# The column is addressed by its cleaned name, "Pick.ups"; if the printed
# column names differ, change the name used here to match.
x <- as.numeric(data[["Pick.ups"]])
# Drop missing values so the statistics computed from `x` are not NA.
x <- x[!is.na(x)]
# Sanity-check the resulting numeric vector.
summary(x)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20.0 100.0 116.5 117.8 127.8 230.0
length(x)
## [1] 50
# Descriptive statistics for the pickups vector `x`.
# Each statistic is stored in its own variable and echoed immediately.

# Centre
mean_val <- mean(x)
cat("Mean:", mean_val, "\n")
## Mean: 117.78
median_val <- median(x)
cat("Median:", median_val, "\n")
## Median: 116.5

# Spread
sd_val <- sd(x)
cat("Standard Deviation:", sd_val, "\n")
## Standard Deviation: 46.28187
var_val <- var(x)
cat("Variance:", var_val, "\n")
## Variance: 2142.012
# range() returns c(min, max), so two numbers are printed.
range_val <- range(x)
cat("Range:", range_val, "\n")
## Range: 20 230
iqr_val <- IQR(x)
cat("IQR:", iqr_val, "\n")
## IQR: 27.75

# Minimum, quartiles and maximum taken from quantile().
five <- quantile(x, probs = c(0, 0.25, 0.5, 0.75, 1))
cat("Five-number summary:", five, "\n")
## Five-number summary: 20 100 116.5 127.75 230
# Plots of the pickups distribution.
# Each plot is stored in a variable, printed for the report, and then saved
# with an explicit `plot =` argument so ggsave() does not depend on whichever
# plot happened to be displayed last.
df <- data.frame(x = x)

# Histogram on the density scale with a kernel density curve on top.
# after_stat(density) replaces the deprecated `..density..` notation.
hist_plot <- ggplot(df, aes(x)) +
  geom_histogram(aes(y = after_stat(density)), bins = 25, color = "black") +
  geom_density(linewidth = 1) +
  labs(
    title = "Histogram with Density of Daily Phone Pickups",
    x = "Daily Phone Pickups", y = "Density"
  ) +
  theme_minimal()
hist_plot
ggsave("hist.png", plot = hist_plot, dpi = 300, width = 6, height = 4)

# Boxplot of the same values.
box_plot <- ggplot(df, aes(y = x)) +
  geom_boxplot(color = "black") +
  labs(title = "Boxplot of Daily Phone Pickups", x = "", y = "Daily Pickups") +
  theme_minimal()
box_plot
ggsave("box.png", plot = box_plot, dpi = 300, width = 6, height = 4)

# Violin outline with the individual observations stacked as dots.
# x = "" puts all values in a single, unlabeled category.
violin_plot <- ggplot(df, aes(x = "", y = x)) +
  geom_violin(trim = FALSE) +
  geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0.6) +
  labs(
    title = "Violin + Dot Plot of Daily Phone Pickups",
    x = "", y = "Daily Pickups"
  ) +
  theme_minimal()
violin_plot
ggsave("violin.png", plot = violin_plot, dpi = 300, width = 6, height = 4)
# Z-score outlier screen.
# Only the maximum and minimum are standardised: they have the largest
# positive and negative deviations from the mean, so if neither exceeds the
# |z| > 3 threshold, no other observation can.
z_max <- (max(x) - mean(x)) / sd(x)
z_min <- (min(x) - mean(x)) / sd(x)
cat("Z-Score for Max:", z_max, "\n")
## Z-Score for Max: 2.424707
cat("Z-Score for Min:", z_min, "\n")
## Z-Score for Min: -2.112706
# Both operands are single values, so the scalar, short-circuiting `||` is the
# right operator inside if(); `|` is the elementwise form meant for vectors.
if (abs(z_max) > 3 || abs(z_min) > 3) {
  cat("Potential outlier(s) detected using the Z-score rule (|z| > 3).\n")
} else {
  cat("No extreme outliers detected via Z-score method.\n")
}
## No extreme outliers detected via Z-score method.
# IQR (1.5 x IQR fence) outlier screen.
# Quartiles and the interquartile range of the pickups vector.
Q1 <- quantile(x, probs = 0.25)
Q3 <- quantile(x, probs = 0.75)
I <- IQR(x)
# Fences sit 1.5 IQRs outside the quartiles.
UF <- Q3 + I * 1.5
LF <- Q1 - I * 1.5
cat("Lower Fence:", LF, "\nUpper Fence:", UF, "\n")
## Lower Fence: 58.375
## Upper Fence: 169.375
# Positions in `x` of the values lying outside the fences.
which(x > UF | x < LF)
## [1] 3 6 13 14 18 19 21 22 23 26 32 34 40 41 48
# The flagged values themselves: everything that is not inside [LF, UF].
outliers <- x[!(x >= LF & x <= UF)]
outliers
## [1] 170 175 190 230 55 23 20 45 200 189 184 190 43 22 198
# Label the distribution shape from the sign of (mean - median):
# mean above median -> right skew, mean below median -> left skew.
# NOTE(review): the "Approximately symmetric" label is only reached when the
# mean and median are exactly equal; any difference, however small, is
# reported as skew.
shape <- if (mean_val > median_val) {
  "Right-skewed (positive)"
} else if (mean_val < median_val) {
  "Left-skewed (negative)"
} else {
  "Approximately symmetric"
}
cat("Distribution Shape:", shape, "\n")
## Distribution Shape: Right-skewed (positive)
Interpretation:
- Shape: Determined by comparing the mean with the median and by
inspecting the histogram.
- Outliers: Identified with the 1.5 × IQR fence rule and the Z-score rule (|z| > 3).
- Impact: Outliers may slightly raise the mean and
stretch the right tail.
- Center & Spread: The mean (117.78) is only slightly above the median (116.5), so the
distribution is close to balanced with a mild right skew; SD & IQR show the variability of phone pickups among
students.
# Export `data` as it stands at this point to CSV, without a row-name column.
write.csv(x = data, file = "project2_data.csv", row.names = FALSE)