1. Import Data and Prepare

# Read the raw workbook; the path is relative, so the .xlsx file must sit in
# the same folder as this .Rmd.
data <- read_excel(path = "Statistics project Data Collection .xlsx")

# Turn the spreadsheet headers into syntactically valid R names
names(data) <- make.names(names(data))

# Show the cleaned headers, then a compact per-column overview
print(names(data))
## [1] "Person.." "Pick.ups"
glimpse(data)
## Rows: 50
## Columns: 2
## $ Person.. <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ Pick.ups <dbl> 90, 120, 170, 154, 124, 175, 143, 124, 115, 110, 101, 85, 190…
# Take the Pick.ups column as a numeric vector and drop missing entries.
# If the column name printed above differs, replace Pick.ups below.
x <- discard(as.numeric(pull(data, Pick.ups)), is.na)

# Check the resulting vector: summary statistics and number of values kept
summary(x)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    20.0   100.0   116.5   117.8   127.8   230.0
length(x)
## [1] 50

2. Descriptive Statistics

# Center
mean_val <- mean(x)
median_val <- median(x)

# Spread: sd() is defined as the square root of var(), so derive it from the
# variance computed on the line above.
var_val <- var(x)
sd_val <- sqrt(var_val)
range_val <- c(min(x), max(x))
iqr_val <- IQR(x)

# Minimum, quartiles and maximum
five <- quantile(x, probs = c(0, 0.25, 0.5, 0.75, 1))

# Report each statistic on its own line
cat("Mean:", mean_val, "\n")
## Mean: 117.78
cat("Median:", median_val, "\n")
## Median: 116.5
cat("Standard Deviation:", sd_val, "\n")
## Standard Deviation: 46.28187
cat("Variance:", var_val, "\n")
## Variance: 2142.012
cat("Range:", range_val, "\n")
## Range: 20 230
cat("IQR:", iqr_val, "\n")
## IQR: 27.75
cat("Five-number summary:", five, "\n")
## Five-number summary: 20 100 116.5 127.75 230

3. Visualization

Histogram + Density

# Wrap the pickup vector in a data frame so ggplot can map it; `df` is reused
# by the boxplot and violin chunks below.
df <- data.frame(x = x)

# Histogram on the density scale (25 bins) with a kernel density curve on top.
# after_stat(density) replaces the `..density..` notation, which has been
# deprecated since ggplot2 3.4.0 -- the same release that introduced the
# `linewidth` argument already used here.
ggplot(df, aes(x)) +
  geom_histogram(aes(y = after_stat(density)), bins = 25, color = "black") +
  geom_density(linewidth = 1) +
  labs(title = "Histogram with Density of Daily Phone Pickups",
       x = "Daily Phone Pickups", y = "Density") +
  theme_minimal()

# ggsave() writes the most recently created plot when `plot` is not supplied
ggsave("hist.png", dpi = 300, width = 6, height = 4)

Boxplot

# Single boxplot of the pickup counts, drawn along the y axis
ggplot(data = df, mapping = aes(y = x)) +
  geom_boxplot(color = "black") +
  theme_minimal() +
  labs(
    title = "Boxplot of Daily Phone Pickups",
    x = "",
    y = "Daily Pickups"
  )

# Write the plot just drawn to disk (last_plot() is ggsave's default)
ggsave(
  filename = "box.png",
  plot = last_plot(),
  width = 6,
  height = 4,
  dpi = 300
)

Violin + Dot Plot

# Violin outline (tails not trimmed) with dots binned along the y axis and
# stacked symmetrically around the centre line
ggplot(data = df, mapping = aes(x = "", y = x)) +
  geom_violin(trim = FALSE) +
  geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0.6) +
  theme_minimal() +
  labs(
    title = "Violin + Dot Plot of Daily Phone Pickups",
    x = "",
    y = "Daily Pickups"
  )

# Write the plot just drawn to disk (last_plot() is ggsave's default)
ggsave(
  filename = "violin.png",
  plot = last_plot(),
  width = 6,
  height = 4,
  dpi = 300
)

4. Outlier Detection

Z-Score Method

# Z-scores of the two extremes only. Every other observation lies between the
# minimum and the maximum, so if neither extreme exceeds |z| = 3, no value does.
z_max <- (max(x) - mean(x)) / sd(x)
z_min <- (min(x) - mean(x)) / sd(x)
cat("Z-Score for Max:", z_max, "\n")
## Z-Score for Max: 2.424707
cat("Z-Score for Min:", z_min, "\n")
## Z-Score for Min: -2.112706
# `||` is the scalar, short-circuiting operator meant for `if` conditions;
# the elementwise `|` is for building vector masks.
if (abs(z_max) > 3 || abs(z_min) > 3) {
  cat("Potential outlier(s) detected using the Z-score rule (|z| > 3).\n")
} else {
  cat("No extreme outliers detected via Z-score method.\n")
}
## No extreme outliers detected via Z-score method.

IQR Fences

# Quartiles and interquartile range
Q1 <- quantile(x, probs = 0.25)
Q3 <- quantile(x, probs = 0.75)
I <- IQR(x)

# Fences sit 1.5 IQRs outside the quartiles
LF <- Q1 - I * 1.5
UF <- Q3 + I * 1.5
cat("Lower Fence:", LF, "\nUpper Fence:", UF, "\n")
## Lower Fence: 58.375 
## Upper Fence: 169.375

# Flag every value outside the fences once, then report positions and values
is_outlier <- x < LF | x > UF
which(is_outlier)
##  [1]  3  6 13 14 18 19 21 22 23 26 32 34 40 41 48
outliers <- x[is_outlier]
outliers
##  [1] 170 175 190 230  55  23  20  45 200 189 184 190  43  22 198

5. Shape and Interpretation

# Label the skew direction from how the mean compares with the median. The
# comparison is exact (no tolerance), so the symmetric label is chosen only
# when the two values are exactly equal.
shape <- if (mean_val > median_val) {
  "Right-skewed (positive)"
} else if (mean_val < median_val) {
  "Left-skewed (negative)"
} else {
  "Approximately symmetric"
}
cat("Distribution Shape:", shape, "\n")
## Distribution Shape: Right-skewed (positive)

Interpretation:
- Shape: Determined by comparing mean vs median and histogram pattern.
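- Outliers: Checked with both the Z-score and IQR methods. The Z-score rule (|z| > 3) flags none, while the 1.5 × IQR fences (58.375 and 169.375) flag 15 values.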
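- Impact: The IQR-flagged values fall on both sides (6 below the lower fence, 9 above the upper fence); the larger high-end group may slightly raise the mean and stretch the right tail.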
- Center & Spread: Mean (117.78) ≈ Median (116.5) → nearly balanced center, with the mean only slightly above the median (a mild right skew); SD & IQR show variability of phone pickups among students.


6. Save Outputs for Submission

# Export the imported table to CSV without the row-name column
write.csv(x = data, file = "project2_data.csv", row.names = FALSE)