Fuglie_C_HW1.R

#### Page 6 # 3 ####
cat("(a) Populations: cars produced by first shift; cars produced by second shift.\n")

## (a) Populations: cars produced by first shift; cars produced by second shift.

cat("(b) Hypothetical? Yes conceptual process populations over time.\n")

## (b) Hypothetical? Yes conceptual process populations over time.

cat("(c) Characteristic: nonconformances per car; compare the means mu1 and mu2.\n")

## (c) Characteristic: nonconformances per car; compare the means mu1 and mu2.

#### Page 10 # 6 ####
# make 1000 fake client IDs
clients <- 1:1000

          #  100 of them at random
set.seed(1)
sample_clients <- sample(clients, 100)

          #  how many we got
length(sample_clients)

## [1] 100

head(sample_clients)

## [1] 836 679 129 930 509 471

##### Page 10 # 8 ####
# make fake production: 30% A, 70% B
production <- c(rep("A", 300), rep("B", 700))

# simple random sample of 50
set.seed(1)
srs <- sample(production, 50)
table(srs)

## srs
##  A  B 
## 10 40

# coin flip way: flip coin 50 times, heads=A, tails=B
set.seed(1)
coin_flips <- sample(c("A","B"), 50, replace=TRUE)

# now actually pick 1 item from that facility each time
coin_sample <- c()
for(i in 1:50){
  if(coin_flips[i]=="A"){
    pick <- sample(which(production=="A"), 1)
    coin_sample <- c(coin_sample, production[pick])
  } else {
    pick <- sample(which(production=="B"), 1)
    coin_sample <- c(coin_sample, production[pick])
  }
}
table(coin_sample)

## coin_sample
##  A  B 
## 27 23

# coin sample ends up closer to 50/50
# while real production is 30/70. so it's not a true srs.

#### Page 13 # 3 ####

# (a) Univariate example: total nonconformances (engine + transmission together)
total_nonconf <- c(2, 0, 1, 3, 2)
print("Total nonconformances (univariate)")

## [1] "Total nonconformances (univariate)"

print(total_nonconf)

## [1] 2 0 1 3 2

# (b) Check if it's quantitative
print("Is total_nonconf numeric?")

## [1] "Is total_nonconf numeric?"

print(is.numeric(total_nonconf))   # TRUE means quantitative

## [1] TRUE

# (c) Statistical population = all BMW cars produced at Graz plant
# Here we just pretend there are 500 cars in the population
N <- 500
cars <- 1:N
set.seed(1)
sample_cars <- sample(cars, 10)    # pick 10 cars randomly
print("Random sample of cars from population:")

## [1] "Random sample of cars from population:"

print(sample_cars)

##  [1] 324 167 129 418 471 299 270 466 187 307

# (d) If engine + transmission recorded separately → bivariate
engine <- c(1,0,1,2,1)
trans  <- c(1,0,0,1,1)
car_data <- data.frame(engine, trans)
print("Engine and transmission nonconformances (bivariate):")

## [1] "Engine and transmission nonconformances (bivariate):"

print(car_data)

##   engine trans
## 1      1     1
## 2      0     0
## 3      1     0
## 4      2     1
## 5      1     1

#### Page 20 # 1 ####
setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) #had to do this because on personal computer

cs<- read.table("Concr.Strength.1s.Data.txt", header = TRUE)

# see what’s inside
names(cs)

## [1] "Str"

head(cs)

##     Str
## 1 43.49
## 2 43.09
## 3 47.68
## 4 44.14
## 5 44.23
## 6 41.52

# grab the strength column 
strength <- cs$Str

# histogram
hist(strength, freq = FALSE,
     main = "Histogram of Concrete Strength",
     xlab = "Compressive Strength (MPa)")

# add density curve
lines(density(strength))

# stem-and-leaf
stem(strength)

## 
##   The decimal point is at the |
## 
##   41 | 5
##   42 | 39
##   43 | 1445788
##   44 | 122357
##   45 | 1446
##   46 | 00246
##   47 | 3577
##   48 | 36
##   49 | 3

#### Page 20 # 4 ####

tempdata <- read.table("Temp.Long.Lat.txt", header = TRUE)

# keep just the numeric columns we need
numdata <- tempdata[, c("JanTemp", "Lat", "Long")]

library(plotly)

## Loading required package: ggplot2

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

# basic 3D scatter (click + drag to rotate)
plot_ly(
  data = numdata,
  x = ~Long, y = ~Lat, z = ~JanTemp,
  type = "scatter3d",
  mode = "markers"
) %>%
  layout(
    title = "3D: January Min Temp vs Latitude & Longitude",
    scene = list(
      xaxis = list(title = "Longitude"),
      yaxis = list(title = "Latitude"),
      zaxis = list(title = "Avg Jan Min Temp (°F)")
    )
  )

#### Page 21 #7 ####
# read the data 
bd <- read.table("SpeedStopCarTruck.txt", header = TRUE)

# color by vehicle type
cols <- ifelse(bd$Auto == "Car", "blue", "red")

# scatter plot
plot(bd$Speed, bd$StopDist,
     col = cols, pch = 19,
     xlab = "Speed (mph)",
     ylab = "Stopping Distance (ft)",
     main = "Stopping Distance vs Speed (Cars vs Trucks)")

# legend
legend("topleft",
       legend = c("Car", "Truck"),
       col = c("blue", "red"),
       pch = 19, bty = "n")

#### Page 21 #13 ####

head(airquality)

##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6

# scatterplot matrix of first 5 variables (Ozone, Solar, Wind, Temp, Month)
pairs(airquality[1:5],
      main = "Scatterplot Matrix of Ozone, Solar, Wind, Temp, Month")

# (a) do ozone levels go up on hot days?
# yeah looks like ozone and temp go up together (positive trend).
# so higher ozone is more likely on hot days.

# (b) what about windy days?
# plot shows ozone goes down when wind goes up.
# so nope, ozone is actually lower on windy days.

# (c) what about solar radiation?
# ozone kinda increases when solar radiation is higher.
# so more sunlight = more ozone.

# (d) which month is worst?
# looking at ozone vs month, july (month 7) is the highest.
# so july is when ozone peaks.

#### Page 22 #17 ####

lw <- read.table("ReasonsLateForWork.txt", sep = ",", header = TRUE)

print(lw)

##        Reason Percent
## 1     Traffic    34.2
## 2  Child Care    28.0
## 3  Pub Transp    17.5
## 4     Weather    12.2
## 5   Overslept     8.9
## 6       Other     3.4

lw <- lw[order(lw$Percent, decreasing = TRUE), ]

# colors
bar_colors <- rainbow(nrow(lw))

# basic bar plot
barplot(lw$Percent,
        names.arg = lw$Reason,
        col = bar_colors,
        width = 0.8,
        las = 2,                 
        main = "Reasons Late for Work — Bar Chart",
        xlab = "",
        ylab = "Percent")

# (a) PIE CHART 
pie(lw$Percent,
    labels = paste0(lw$Reason, " (", lw$Percent, "%)"),
    col = bar_colors,
    main = "Reasons Late for Work — Pie Chart")

# (b) PARETO CHART (bars + cumulative percentage line) 
xpos <- barplot(lw$Percent,
                names.arg = lw$Reason,
                col = bar_colors,
                width = 0.8,
                las = 2,
                ylim = c(0, max(100, sum(lw$Percent))),  # keep room for line
                main = "Pareto Chart: Reasons Late for Work",
                xlab = "",
                ylab = "Percent")

# cumulative percentages
cum_perc <- cumsum(lw$Percent)

lines(x = xpos, y = cum_perc, col = "red", lwd = 2)
points(x = xpos, y = cum_perc, col = "red", pch = 19)
abline(h = 80, lty = 2, col = "gray")  # 80% reference line (optional)

axis(side = 4, at = seq(0, 100, by = 20), labels = paste0(seq(0, 100, by = 20), "%"))
mtext("Cumulative %", side = 4, line = 2)

legend("topright",
       legend = c("Bar height = Percent", "Cumulative %"),
       col = c("gray40", "red"),
       pch = c(15, 19),
       lty = c(NA, 1),
       bty = "n")

#### Page 29-31 #1 ####

# (a) sample mean
xbar <- 37
xbar   # print it

## [1] 37

# (b) sample standard deviation
s <- 18
s     # print it

## [1] 18

# (c) sample proportion
phat <- 0.72
phat  # print

## [1] 0.72

#### Page 29-31 #4 ####

# read the data
cs <- read.table("Concr.Strength.1s.Data.txt", header = TRUE)

#  column names
names(cs)

## [1] "Str"

Str <- cs$Str

# proportion with strength <= 44
prop_leq_44 <- sum(Str <= 44) / length(Str)
prop_leq_44   # print it

## [1] 0.3125

# proportion with strength >= 47
prop_geq_47 <- sum(Str >= 47) / length(Str)
prop_geq_47   # print it

## [1] 0.21875

#### Page 29-31 #6 ####

# fake data:
population <- c(65,70,72,68,75,80,77,74,69,73)

# function to take one random sample of size 50, return mean + variance
one_sample <- function() {
  samp <- sample(population, size = 50, replace = TRUE)  # random sample
  xbar <- mean(samp)   # sample mean
  s2   <- var(samp)    # sample variance
  return(c(xbar, s2))
}

# repeat 5 times
results <- replicate(5, one_sample())

# transpose so each row = one sample
results <- t(results)

# show the 5 pairs (xbar, s^2)
colnames(results) <- c("xbar", "s2")
results

##       xbar       s2
## [1,] 73.16 18.21878
## [2,] 73.20 22.04082
## [3,] 71.84 13.93306
## [4,] 71.80 18.44898
## [5,] 71.82 17.78327

#### Page 29-31 #9 ####

# (a) population mean (mu) and variance (sigma^2) for a fair die {1,2,3,4,5,6}
vals <- 1:6
mu <- mean(vals)          # should be 3.5
sigma2 <- var(vals) * (length(vals)-1)/length(vals)  
# note: var() uses sample variance; multiply by (n-1)/n to get population variance
mu

## [1] 3.5

sigma2

## [1] 2.916667

# answer: mu = 3.5 ; sigma^2 = 35/12 ≈ 2.9166667

# (b) take a sample of size 100 with replacement; compute sample mean & variance
set.seed(1)               # just so we can all get the same example
x <- sample(1:6, 100, replace = TRUE)
xbar <- mean(x)
s2 <- var(x)              # sample variance
xbar

## [1] 3.52

s2

## [1] 3.34303

# comment: xbar should be near 3.5 and s2 near 2.9167, but not exact because of randomness.

# (c) sample proportions for outcomes 1..6
props <- table(x) / 100
props

## x
##    1    2    3    4    5    6 
## 0.19 0.18 0.12 0.15 0.15 0.21

#### Page 29-31 #16 ####
# Problem 16

# sample salaries (in $1000 units)
salaries <- c(152,169,178,179,185,188,195,196,198,203,204,209,210,212,214)

# (a) sample mean and variance
mean_salary <- mean(salaries)
var_salary  <- var(salaries)

cat("(a) Sample mean =", mean_salary, "\n")

## (a) Sample mean = 192.8

cat("(a) Sample variance =", var_salary, "\n\n")

## (a) Sample variance = 312.3143

# (b)(i) 
salaries_raise5000 <- salaries + 5
mean_raise5000 <- mean(salaries_raise5000)
var_raise5000  <- var(salaries_raise5000)

cat("(b)(i) Mean with $5000 raise =", mean_raise5000, "\n")

## (b)(i) Mean with $5000 raise = 197.8

cat("(b)(i) Variance with $5000 raise =", var_raise5000, "\n\n")

## (b)(i) Variance with $5000 raise = 312.3143

# (b)(ii) 5% raise for everyone
salaries_raise5pct <- salaries * 1.05
mean_raise5pct <- mean(salaries_raise5pct)
var_raise5pct  <- var(salaries_raise5pct)

cat("(b)(ii) Mean with 5% raise =", mean_raise5pct, "\n")

## (b)(ii) Mean with 5% raise = 202.44

cat("(b)(ii) Variance with 5% raise =", var_raise5pct, "\n")

## (b)(ii) Variance with 5% raise = 344.3265

#### Page 35 #2 ####
# Problem 2

t <- read.table("RobotReactTime.txt", header = TRUE)

# look at the column names
names(t)

## [1] "Time"  "Robot"

# take only Robot 1's times
t1 <- t$Time[t$Robot == 1]

# sort the times
t1_sorted <- sort(t1)

# (a) median and 25th / 75th percentiles
median_t1 <- median(t1_sorted)
q25 <- quantile(t1_sorted, 0.25)
q75 <- quantile(t1_sorted, 0.75)

cat("(a) Median =", median_t1, "\n")

## (a) Median = 30.55

cat("(a) 25th percentile =", q25, "\n")

## (a) 25th percentile = 29.6325

cat("(a) 75th percentile =", q75, "\n\n")

## (a) 75th percentile = 31.375

# (b) interquartile range (IQR)
iqr_t1 <- IQR(t1_sorted)
cat("(b) Interquartile range =", iqr_t1, "\n\n")

## (b) Interquartile range = 1.7425

# (c) percentile of the 19th ordered value
n <- length(t1_sorted)
value_19 <- t1_sorted[19]
percentile_19 <- 100 * 19 / n

cat("(c) 19th ordered value =", value_19, "\n")

## (c) 19th ordered value = 31.6

cat("(c) This value is at about the", percentile_19, "th percentile\n")

## (c) This value is at about the 86.36364 th percentile

#### Page 35 #3 ####
# Problem 3

t <- read.table("RobotReactTime.txt", header = TRUE)

# take only Robot 2's times
t2 <- t$Time[t$Robot == 2]

# (a) five number summary
cat("(a) Five number summary:\n")

## (a) Five number summary:

summary(t2)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   28.97   29.30   29.94   30.11   30.82   32.23

# (b) 90th percentile
p90 <- quantile(t2, 0.90)
cat("\n(b) 90th percentile =", p90, "\n")

## 
## (b) 90th percentile = 31.068

# (c) boxplot + check for outliers
boxplot(t2,
        main = "Boxplot of Robot 2 Reaction Times",
        ylab = "Reaction Time")

#### Page 35 #4a ####
# Problem 4

si <- read.table("SolarIntensAuData.txt", header = TRUE)

head(si)

##    SI
## 1 673
## 2 673
## 3 676
## 4 677
## 5 680
## 6 682

# (a) boxplot of solar intensity
boxplot(si$SI,
        main = "Boxplot of Solar Intensity Measurements",
        ylab = "Solar Intensity")

# (b) summary statistics (five-number summary + mean)
summary(si$SI)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   673.0   697.0   717.0   714.2   734.0   758.0

# (c) list outliers explicitly
outliers <- boxplot.stats(si$SI)$out
outliers

## integer(0)

Fuglie_C_HW1.R

wendy

2025-09-07