library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(modeest)
library(ggpubr)
## Loading required package: ggplot2
library(ggplot2)
library(BSDA)
## Loading required package: lattice
## 
## Attaching package: 'BSDA'
## The following object is masked from 'package:datasets':
## 
##     Orange
# NUMBER 3: CLEANING DATA

# Birthdate of favorite celebrity (Nick Jonas) is September 16, 1992, born in Dallas, TX
# The CSV file records whether each day of the week was sunny during Jonas' birth week, as well as during the weeks of 9/16/02, 9/16/12, and 9/16/22, i.e. the same week over 10-year increments
# 1 indicates that it was a sunny day, 2 indicates that it was not a sunny day

weather <- read.csv("/Users/yahavmanor/Downloads/Exam_1_Sunny - Sheet1.csv")
weather
##   Day.of.Week Sunny1992 Sunny2002 Sunny2012 Sunny2022
## 1      Monday         1         1         1         1
## 2     Tuesday         1         1         1         1
## 3   Wednesday         1         2         1         1
## 4    Thursday         1         1         1         1
## 5      Friday         1         2         1         1
## 6   Saturday          2         1         1         2
## 7      Sunday         1         1         2         2
#Cleaning data

remove_comma_1992 <- gsub(",", "", weather$Sunny1992)
remove_comma_2002 <- gsub(",", "", weather$Sunny2002)
remove_comma_2012 <- gsub(",", "", weather$Sunny2012)
remove_comma_2022 <- gsub(",", "", weather$Sunny2022)
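
# Note: the gsub() calls above return character vectors that are stored separately;
# they are not written back into the weather data frame. A minimal sketch of doing the
# cleaning column-wise while keeping the columns numeric (column names taken from the
# table above) could look like this:
sunny_cols <- grep("^Sunny", names(weather), value = TRUE)
weather[sunny_cols] <- lapply(weather[sunny_cols],
                              function(col) as.numeric(gsub(",", "", col)))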
  
#Checking all data is numeric

is.numeric(weather$Sunny1992)
## [1] TRUE
is.numeric(weather$Sunny2002)
## [1] TRUE
is.numeric(weather$Sunny2012)
## [1] TRUE
is.numeric(weather$Sunny2022)
## [1] TRUE
#Checking all data does not have missing values
is.na(weather$Sunny1992)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
is.na(weather$Sunny2002)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
is.na(weather$Sunny2012)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
is.na(weather$Sunny2022)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
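# A compact alternative (sketch): run the same numeric/missing-value checks on every
# Sunny column at once.
sunny_cols <- grep("^Sunny", names(weather), value = TRUE)
sapply(weather[sunny_cols], is.numeric)   # expected TRUE for each column
colSums(is.na(weather[sunny_cols]))       # expected 0 for each column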
# NUMBER 3: HISTOGRAMS

# Histogram of the weather in each 10 year increment (1992, 2002, 2012, and 2022)

# I treat this data as ratio data, with a "true zero" of "no sunny weather": a non-sunny day is represented as a "2" and a sunny day as a "1" in the table


hist(weather$Sunny1992, breaks = 2, main = "Weather During the Week of 9/16/92", xlab = "Sunny", ylab = "Frequency", freq=FALSE)

#Overlay of a density curve (kernel density estimate, not a true normal curve)
lines(density(weather$Sunny1992, na.rm = TRUE), col = "red", lwd = 5)

hist(weather$Sunny2002, breaks = 2, main = "Weather During the Week of 9/16/02", xlab = "Sunny", ylab = "Frequency", freq=FALSE)

#Overlay of a density curve (kernel density estimate)
lines(density(weather$Sunny2002, na.rm = TRUE), col = "blue", lwd = 5)

hist(weather$Sunny2012, breaks = 2, main = "Weather During the Week of 9/16/12", xlab = "Sunny", ylab = "Frequency", freq=FALSE)

#Overlay of a density curve (kernel density estimate)
lines(density(weather$Sunny2012, na.rm = TRUE), col = "yellow", lwd = 5)

hist(weather$Sunny2022, breaks = 2, main = "Weather During the Week of 9/16/22", xlab = "Sunny", ylab = "Frequency", freq=FALSE)

#Overlay of a density curve (kernel density estimate)
lines(density(weather$Sunny2022, na.rm = TRUE), col = "green", lwd = 5)
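
# The density() overlays above are kernel density estimates of the observed values,
# not normal curves. A minimal sketch of overlaying an actual normal curve (using the
# sample mean and sd) on one of the histograms, shown here for 1992 only:
hist(weather$Sunny1992, breaks = 2, main = "Weather During the Week of 9/16/92",
     xlab = "Sunny", ylab = "Density", freq = FALSE)
curve(dnorm(x, mean = mean(weather$Sunny1992), sd = sd(weather$Sunny1992)),
      col = "red", lwd = 2, add = TRUE)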

# NUMBER 3: COMPARING MEAN, MEDIAN, AND MODE

# Comparisons of the mean, median, and mode of the sunny weather each week, on the weeks of September 16th over four 10 year increments

mean1992 <- mean(weather$Sunny1992)
mean1992
## [1] 1.142857
mean2002 <- mean(weather$Sunny2002)
mean2002
## [1] 1.285714
mean2012 <- mean(weather$Sunny2012)
mean2012
## [1] 1.142857
mean2022 <- mean(weather$Sunny2022)
mean2022
## [1] 1.285714
# Means are fairly close to each other (1.14, 1.29, 1.14, 1.29), indicating some normality in the distribution of sunny weather

median1992 <- median(weather$Sunny1992)
median1992
## [1] 1
median2002 <- median(weather$Sunny2002)
median2002
## [1] 1
median2012 <- median(weather$Sunny2012)
median2012
## [1] 1
median2022 <- median(weather$Sunny2022)
median2022
## [1] 1
# Medians are all 1, indicating some normality in the distribution of sunny weather

mode1992 <- mfv(weather$Sunny1992)
mode1992
## [1] 1
mode2002 <- mfv(weather$Sunny2002)
mode2002
## [1] 1
mode2012 <- mfv(weather$Sunny2012)
mode2012
## [1] 1
mode2022 <- mfv(weather$Sunny2022)
mode2022
## [1] 1
# Modes are all 1, indicating some normality in the distribution of sunny weather

# Overall, since the numbers for mean, median, and mode are roughly the same, I would argue that the data over each 10 year increment is normally distributed
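
# A compact sketch of the same comparison: one table of mean, median, and mode
# (mfv() from modeest, loaded above) for all four years at once.
sunny_cols <- grep("^Sunny", names(weather), value = TRUE)
sapply(weather[sunny_cols],
       function(col) c(mean = mean(col), median = median(col), mode = mfv(col)[1]))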
# NUMBER 3: COMPARING TESTS FOR NORMALITY

# Conducting Shapiro-Wilk tests to assess the normality of the data; the data is consistent with a normal distribution if the p-value is > 0.05 (i.e. not statistically significantly different from normal)

# Null hypothesis: Data is normally distributed across all 4 10-year increments
# Alternative hypothesis: Data is not normally distributed across all 4 10-year increments

shapiro.test(weather$Sunny1992)
## 
##  Shapiro-Wilk normality test
## 
## data:  weather$Sunny1992
## W = 0.45297, p-value = 4.136e-06
shapiro.test(weather$Sunny2002)
## 
##  Shapiro-Wilk normality test
## 
## data:  weather$Sunny2002
## W = 0.6004, p-value = 0.0002752
shapiro.test(weather$Sunny2012)
## 
##  Shapiro-Wilk normality test
## 
## data:  weather$Sunny2012
## W = 0.45297, p-value = 4.136e-06
shapiro.test(weather$Sunny2022)
## 
##  Shapiro-Wilk normality test
## 
## data:  weather$Sunny2022
## W = 0.6004, p-value = 0.0002752
# Since each p-value is < 0.05, we reject the null hypothesis, meaning there is evidence that the data is not normally distributed. However, because this is binary (1/2-coded) data, the Shapiro-Wilk test is not well suited to judging normality here, and it is more informative to rely on the earlier statistical analyses (mean, median, and mode) to draw conclusions about the normality of the data.
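# A compact sketch: the four Shapiro-Wilk p-values extracted in one call.
sapply(weather[grep("^Sunny", names(weather), value = TRUE)],
       function(col) shapiro.test(col)$p.value)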
# NUMBER 3: DETERMINING IF WEATHER WAS UNUSUAL

# To determine if the weather during my favorite celebrity's birth week was unusual compared to the later 10-year increments, I use the binomial formula. First I calculate the probability of a sunny day (value = 1) across the 7 days of Nick Jonas' birth week, then do the same for each of the three later 10-year increments.

count1992 <- sum(weather$Sunny1992 == 1, na.rm = TRUE) # counts number of sunny days in week of 9/16/92

p1992 <- count1992/7 #calculates probability

p1992 # this is the p variable that will be used in the binomial formula
## [1] 0.8571429
count2002 <- sum(weather$Sunny2002 == 1, na.rm = TRUE) # counts number of sunny days in week of 9/16/02

p2002 <- count2002/7 #calculates probability


count2012 <- sum(weather$Sunny2012 == 1, na.rm = TRUE) # counts number of sunny days in week of 9/16/12

p2012 <- count2012/7 #calculates probability


count2022 <- sum(weather$Sunny2022 == 1, na.rm = TRUE) # counts number of sunny days in week of 9/16/22

p2022 <- count2022/7 #calculates probability

# Now, I am going to determine how many of the later weeks were at least as sunny as Nick Jonas' birth week

k1 <- 0

if(p2002 >= p1992)
{
  k1 <- k1 + 1
}

if(p2012 >= p1992)
{
  k1 <- k1 + 1
}

if(p2022 >= p1992)
{
  k1 <- k1 + 1
}

k1 # this is the k variable that will be used in the binomial formula
## [1] 1
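# A vectorized sketch equivalent to the three if blocks above: count how many of the
# later weeks were at least as sunny as the 1992 week.
k1_alt <- sum(c(p2002, p2012, p2022) >= p1992)
k1_alt # same value as k1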
# Since the sample is 4 different 10-year increments, n = 4

#Now, I will use the binomial formula to determine if the weather is abnormal
x <- dbinom(1, 4, 0.86) # k = k1, n = 4 (number of 10-year increments), p = p1992 rounded to 0.86
x
## [1] 0.00943936
# Since the output of the binomial formula is close to 0 (0.009), it is safe to conclude that the sunny weather during Nick Jonas' birth week was statistically unusual in comparison to the weather in the same location over the next 3 decades. This conclusion makes sense, as rain levels and cloud presence in Dallas have increased over time (based on data from climatecheck.com), which makes a difference in sunny weather between 1992 and today plausible.
# NUMBER 4: PLOTTING EXAM DATA

examdata <- read.csv("/Users/yahavmanor/Downloads/Grades_vs_slept_hours_data - Sheet1.csv")
examdata
##   Hours.slept Exam.grade
## 1           6         85
## 2           9         94
## 3           4         71
## 4           5         90
## 5           8         82
# This file's data shows the impact of hours slept overnight on exam grades for 5 different students. Since hours slept likely affects exam grades, exam grade is treated as dependent on hours slept. The "hours slept" variable is ratio data: it is numerical and has a true zero (0 hours of sleep means no sleep). Exam grade, however, is interval data: a 0% does not mean "the exam was not taken", it means the exam was taken and the student scored 0 on it.

hours_slept <- examdata$Hours.slept
exam_grade <- examdata$Exam.grade

# Histogram of just hours slept data
hist(examdata$Hours.slept, breaks = 10, main = "Distribution of Hours Slept", xlab = "Hours Slept", freq=FALSE)

# Bar graph of data
ggplot(examdata, aes(x = factor(hours_slept), y = exam_grade)) +
  geom_col(fill = "yellow", color = "black") +
  labs(title = "Exam Grade vs. Hours Slept", x = "Hours Slept (hrs)", y = "Exam Grade (%)")

#Line graph of data
ggplot(examdata, aes(x = hours_slept, y = exam_grade)) +
  geom_line(color = "blue") +
  geom_point(color = "red") +
  labs(title = "Exam Grade vs. Hours Slept", x = "Hours Slept (hrs)", y = "Exam Grade (%)")
#Scatter plot of data
ggplot(examdata, aes(x = hours_slept, y = exam_grade)) +
  geom_point(color = "green") +
  labs(title = "Exam Grade vs. Hours Slept", x = "Hours Slept (hrs)", y = "Exam Grade (%)")
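
# Since the question is how exam grade relates to hours slept, a fitted trend line
# makes the direction of the relationship explicit. A minimal sketch adding a linear
# fit to the scatter plot (geom_smooth() from ggplot2, which is already loaded):
ggplot(examdata, aes(x = Hours.slept, y = Exam.grade)) +
  geom_point(color = "green") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(title = "Exam Grade vs. Hours Slept", x = "Hours Slept (hrs)", y = "Exam Grade (%)")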

# NUMBER 5: MEDFORD, MA VS ALL OF MA FATAL PEDESTRIAN CAR CRASHES (USING Z-SCORES)

ma_crashes <- read.csv("/Users/yahavmanor/Downloads/MassDOT_FatalPedestrianCrashes_2022 - export_3_12_2025_10_04_53.csv")

# Using data from MassDOT website to find total number of fatal pedestrian car crashes in 2022, split up by town/city. Other values in the table show "crash number", "latitude", and "longitude", which are values that are not relevant to the following statistical analysis. The number of fatal pedestrian car crashes is an example of ratio data, as it is numerical and there exists a true zero (0 fatal pedestrian crashes means no pedestrian died from a car crash in that town/city)

ma_crashes
##    Crash.Number   City.Town.Name Fatal.Ped.Crash Crash.Year Latitude Longitude
## 1       5052191        BILLERICA               1       2022 42.55844 -71.29558
## 2       5059678       FRAMINGHAM               3       2022 42.29702 -71.48331
## 3       5060320        WAKEFIELD               1       2022 42.47664 -71.06224
## 4       5060322        WORCESTER               7       2022 42.24225 -71.79738
## 5       5063796         YARMOUTH               1       2022 41.64414 -70.24139
## 6       5063797        STOUGHTON               1       2022 42.15833 -71.06739
## 7       5066618           DEDHAM               2       2022 42.23276 -71.18415
## 8       5068454            SALEM               1       2022 42.49322 -70.93434
## 9       5068542          HOLYOKE               1       2022 42.19449 -72.61555
## 10      5074933           WOBURN               1       2022 42.47982 -71.14877
## 11      5074937          AMHERST               1       2022 42.38504 -72.52661
## 12      5076013           SHARON               2       2022 42.09490 -71.22132
## 13      5077522         ABINGTON               1       2022 42.14531 -70.95434
## 14      5079893        CAMBRIDGE               2       2022 42.37314 -71.08100
## 15      5080806           QUINCY               2       2022 42.27274 -71.02775
## 16      5089302          WALTHAM               1       2022 42.37705 -71.25180
## 17      5090710           BOSTON               9       2022 42.35028 -71.06025
## 18      5093261      SPRINGFIELD               3       2022 42.11580 -72.58886
## 19      5095443          GARDNER               2       2022 42.57131 -72.00416
## 20      5100657           LOWELL               2       2022 42.64944 -71.30412
## 21      5104030         BROCKTON               3       2022 42.05635 -71.03705
## 22      5107972          TAUNTON               1       2022 41.89359 -71.10611
## 23      5108089          ANDOVER               1       2022 42.66028 -71.13676
## 24      5113576      NORTHAMPTON               1       2022 42.32133 -72.63152
## 25      5114947        HAVERHILL               2       2022 42.75894 -71.09491
## 26      5119045          WALPOLE               1       2022 42.14742 -71.20171
## 27      5120156      NEW BEDFORD               1       2022 41.66796 -70.92116
## 28      5122131         MEDFIELD               1       2022 42.17668 -71.31343
## 29      5134732      EASTHAMPTON               1       2022 42.27930 -72.67127
## 30      5135529           SUTTON               1       2022 42.15125 -71.75934
## 31      5141017          HALIFAX               1       2022 41.99737 -70.83691
## 32      5141928           SAUGUS               1       2022 42.45746 -71.02399
## 33      5143471          WEBSTER               1       2022 42.05000 -71.87855
## 34      5146274          SWANSEA               1       2022 41.74971 -71.21651
## 35      5148720       STURBRIDGE               1       2022 42.11238 -72.12926
## 36      5149764          BEDFORD               1       2022 42.48636 -71.25888
## 37      5150457       LEOMINSTER               1       2022 42.49198 -71.72750
## 38      5151664         ROCKPORT               1       2022 42.65557 -70.62460
## 39      5151665         CHICOPEE               5       2022 42.15713 -72.59853
## 40      5161195       PLAINVILLE               1       2022 42.03789 -71.31303
## 41      5163638            ACTON               1       2022 42.49340 -71.41629
## 42      5164740 WEST SPRINGFIELD               2       2022 42.09064 -72.62650
## 43      5166405           MONSON               1       2022 42.10296 -72.31933
## 44      5167711        TEWKSBURY               1       2022 42.63948 -71.23795
## 45      5181321       FOXBOROUGH               1       2022 42.06298 -71.21882
## 46      5184462     TYNGSBOROUGH               1       2022 42.68981 -71.44360
## 47      5184562          MEDFORD               1       2022 42.41376 -71.07936
## 48      5189227       GREENFIELD               1       2022 42.60680 -72.58849
## 49      5193700          DOUGLAS               1       2022 42.07138 -71.73507
## 50      5243043        SOUTHWICK               1       2022 42.02472 -72.78799
## 51      5244826          SHIRLEY               1       2022 42.54420 -71.65575
## 52      5252599          PEABODY               1       2022 42.52238 -70.92462
## 53      5259613       SHREWSBURY               1       2022 42.27721 -71.69750
# In 2022, there was one fatal pedestrian car crash (referred to as an "incident") in Medford, MA (based on ma_crashes)

medford_fatalities <- 1  
# NUMBER 5: CHECKING FOR NORMALITY AND CLEANING DATA

# Cleaning data:

# Removing commas

remove_comma <- gsub(",", "", ma_crashes$Fatal.Ped.Crash)
  
#Checking all data is numeric

numeric <- is.numeric(ma_crashes$Fatal.Ped.Crash)

#Checking all data does not have missing values
anyNA(ma_crashes$Fatal.Ped.Crash)
# Characterizing the normality of the dataset of fatal pedestrian car crashes in Massachusetts in 2022

mean(ma_crashes$Fatal.Ped.Crash)
## [1] 1.603774
median(ma_crashes$Fatal.Ped.Crash)
## [1] 1
mfv(ma_crashes$Fatal.Ped.Crash)
## [1] 1
hist(ma_crashes$Fatal.Ped.Crash, breaks = 10, main = "Fatal Pedestrian Crashes in MA in 2022", xlab = "Number of Crashes", ylab = "Frequency", freq=FALSE)
lines(density(ma_crashes$Fatal.Ped.Crash, na.rm = TRUE), col = "green", lwd = 5)

# The data is skewed right, indicating a lack of normality. This is likely because most towns/cities in the table have a similar, small number of fatal pedestrian crashes (one or two), since most have broadly comparable populations, with the exception of major cities like Boston and Worcester.

# Null hypothesis: Data is normally distributed 
# Alternative hypothesis: Data is not normally distributed 

shapiro.test(ma_crashes$Fatal.Ped.Crash)
## 
##  Shapiro-Wilk normality test
## 
## data:  ma_crashes$Fatal.Ped.Crash
## W = 0.46253, p-value = 1.189e-12
# The differing mean, median, and mode, the right-skewed histogram, and a Shapiro-Wilk p-value below 0.05 all indicate that the data is not normally distributed.
# Calculating the z-score to determine if the number of incidents in Medford is significantly higher or lower than expected. If Z > 2 or Z < -2, the number of incidents in Medford can be considered unusual (roughly a 95% confidence threshold). If -2 < Z < 2, the number of incidents in Medford is within the normal range in comparison to other towns and cities in Massachusetts.

mean_fatped_ma <- mean(ma_crashes$Fatal.Ped.Crash)

sd_fatped_ma <- sd(ma_crashes$Fatal.Ped.Crash)
sd_fatped_ma
## [1] 1.497942
z_score <- ((medford_fatalities - mean_fatped_ma)/sd_fatped_ma)
z_score
## [1] -0.4030686
# Since the z-score is -0.4, it can be concluded that Medford is within the normal range of fatal pedestrian car crashes in 2022 in comparison to other towns/cities that had at least one fatal pedestrian car crash in Massachusetts in the same year
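
# A sketch extending the same calculation to every town/city: compute each town's
# z-score and flag any municipality outside the +/- 2 range.
crash_z <- (ma_crashes$Fatal.Ped.Crash - mean_fatped_ma) / sd_fatped_ma
ma_crashes[abs(crash_z) > 2, c("City.Town.Name", "Fatal.Ped.Crash")]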
# NUMBER 5: MA VS NH FATAL PEDESTRIAN CAR CRASHES (USING Z-TEST)

# Conducting a z-test comparing data from your state with the data for NH

# Data online does not show a breakdown of fatal pedestrian car crashes by town/city across NH in 2022, so I use the mean and standard deviation reported on the NHTSA and NHDOT websites.

# Null hypothesis: There is no significant difference between the mean number of incidents between Massachusetts and New Hampshire
# Alternative hypothesis: There is a statistically significant difference between the means

mean_fatped_nh <- 1.15

sd_fatped_nh <- 1.7

n_ma <- 298 #total towns in MA
n_nh <- 221 #total towns in NH

# Generating vectors of numbers using the data collected from DOT websites for MA and NH in order to conduct z test

data_ma <- rnorm(n_ma, mean = mean_fatped_ma, sd = sd_fatped_ma)
data_nh <- rnorm(n_nh, mean = mean_fatped_nh, sd = sd_fatped_nh)

z.test(x = data_ma, 
       y = data_nh, 
       sigma.x = sd_fatped_ma, 
       sigma.y = sd_fatped_nh, 
       alternative = "two.sided",
       conf.level = 0.95)
## 
##  Two-sample z-Test
## 
## data:  data_ma and data_nh
## z = 6.0654, p-value = 1.316e-09
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.5893317 1.1520368
## sample estimates:
## mean of x mean of y 
## 1.7672992 0.8966149
# The p-value (about 1.3e-09, well below 0.05) indicates that the difference in fatal pedestrian car crashes between MA and NH is statistically significant, so we reject the null hypothesis. This is consistent with NH being ranked among the safest states in the country for pedestrians, with Massachusetts only slightly higher on that list. It is also likely that the heavily right-skewed (non-normal) Massachusetts data affects the result: eastern MA, where Boston is located, has a significantly higher population and therefore higher fatal pedestrian crash counts, while western MA is much more rural. Because the MA distribution is so uneven, the z-test results should be interpreted with caution; a better analysis could set a minimum population requirement for a town/city to be included in the ma_crashes dataset.
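# Because only summary statistics were available for NH, an alternative to simulating
# data with rnorm() (which changes on every run) is to compute the two-sample z
# statistic directly from the means, standard deviations, and sample sizes already
# defined above. A minimal sketch:
z_direct <- (mean_fatped_ma - mean_fatped_nh) /
  sqrt(sd_fatped_ma^2 / n_ma + sd_fatped_nh^2 / n_nh)
p_direct <- 2 * pnorm(-abs(z_direct)) # two-sided p-value
c(z = z_direct, p = p_direct)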
# NUMBER 6: NORMAL INDEPENDENT SAMPLE T-TEST

# Generating random datasets for the exam scores of two different classes (the classes are independent of each other, so the samples are independent), based on given means and standard deviations. This data is considered interval data: it is numerical but has no true zero.
class_A <- rnorm(30, mean = 75, sd = 10)  
class_B <- rnorm(30, mean = 80, sd = 10) 
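
# Note: class_A and class_B are fresh random draws, so the exact numbers in the
# outputs below will differ on every knit. A sketch of making them reproducible
# (the seed value is arbitrary; the outputs shown here were generated without one):
# set.seed(2024)
# class_A <- rnorm(30, mean = 75, sd = 10)
# class_B <- rnorm(30, mean = 80, sd = 10)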

# First, testing for normality of data to determine whether a parametric or nonparametric test should be conducted
shapiro.test(class_A) #p-value of 0.88 (> 0.05) indicates a normal distribution
## 
##  Shapiro-Wilk normality test
## 
## data:  class_A
## W = 0.98234, p-value = 0.8838
shapiro.test(class_B) #p-value of 0.53 (> 0.05) indicates a normal distribution
## 
##  Shapiro-Wilk normality test
## 
## data:  class_B
## W = 0.96961, p-value = 0.5285
# Therefore, the data is parametric. Next, I am generating a dataframe to be used in the independent sample t-test
data <- data.frame(
  Score = c(class_A, class_B),
  Class = rep(c("A", "B"), each = 30)
)

t.test(Score ~ Class, data = data, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  Score by Class
## t = -0.40543, df = 58, p-value = 0.6867
## alternative hypothesis: true difference in means between group A and group B is not equal to 0
## 95 percent confidence interval:
##  -5.426918  3.598827
## sample estimates:
## mean in group A mean in group B 
##        78.53360        79.44765
# P-value of 0.69 (> 0.05) indicates that we fail to reject the null hypothesis: there is no statistically significant difference between the mean exam scores of the two classes. In the specific case of this example, this suggests that both classes are performing similarly, consistent with the classes learning similar content and being held to fair, comparable expectations on the exam.
# NUMBER 6: NORMAL PAIRED/DEPENDENT SAMPLE T-TEST

# Data shows grades for 7 different students in a calculus class on their practice exam scores (pre) and actual exam scores (post). Much like the example before, this data is considered interval data as it is numerical, though there exists no true zero. 

pre <- c(78, 82, 88, 94, 69, 75, 80)
post <- c(85, 84, 90, 98, 70, 78, 83)

# First, testing for normality of data to determine whether a parametric or nonparametric test should be conducted

shapiro.test(pre) #p-value of 0.98 indicates normal distribution
## 
##  Shapiro-Wilk normality test
## 
## data:  pre
## W = 0.98632, p-value = 0.9844
shapiro.test(post) #p-value of 0.9575 indicates normal distribution
## 
##  Shapiro-Wilk normality test
## 
## data:  post
## W = 0.97958, p-value = 0.9575
t.test(pre, post, paired = TRUE)
## 
##  Paired t-test
## 
## data:  pre and post
## t = -4.2603, df = 6, p-value = 0.00532
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -4.947971 -1.337743
## sample estimates:
## mean difference 
##       -3.142857
# P-value of 0.005 indicates that we should reject the null hypothesis, which states that there is no true mean difference between the paired scores. Therefore, we can conclude that there is a statistically significant difference between the two sets of scores: actual exam scores are, on average, about 3 points higher than practice exam scores, suggesting that taking a practice exam may help these students perform better on the actual exam.
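# For a paired t-test, the assumption that matters is that the paired differences are
# roughly normal (rather than each group separately). A minimal sketch of checking
# that directly:
score_diff <- pre - post
shapiro.test(score_diff)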
# NUMBER 6: NONPARAMETRIC INDEPENDENT TEST (Mann-Whitney)

#Data is generated to show customer review ratings of two different stores, a record store and a phone store. This data is considered interval data as it is numerical (ratings 1 through 5) though there exists no true zero (no such thing as a 0 rating)
record_store <- sample(1:5, 30, replace = TRUE) 
phone_store <- sample(1:5, 30, replace = TRUE)


# First, testing for normality of data to determine whether a parametric or nonparametric test should be conducted
shapiro.test(record_store) #p-value of 0.004 (< 0.05) indicates that the data is not normal, so a nonparametric test needs to be used
## 
##  Shapiro-Wilk normality test
## 
## data:  record_store
## W = 0.88599, p-value = 0.003881
shapiro.test(phone_store) #p-value of 0.009 (< 0.05) indicates that the data is not normal, so a nonparametric test needs to be used
## 
##  Shapiro-Wilk normality test
## 
## data:  phone_store
## W = 0.90165, p-value = 0.009219
# Since the samples are independent (reviews of one store do not impact the other) and the data is not normal (nonparametric), a Mann-Whitney U test needs to be conducted

# Generating a dataframe to be used in the Mann-Whitney test
data <- data.frame(
  Rating = c(record_store, phone_store),
  Store = rep(c("A", "B"), each = 30)
)

# Perform Mann-Whitney U test
wilcox.test(Rating ~ Store, data = data)
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Rating by Store
## W = 417, p-value = 0.6228
## alternative hypothesis: true location shift is not equal to 0
# P-value of 0.62 (> 0.05) indicates that we fail to reject the null hypothesis, which states that the distributions of the two samples are identical. In this example, there is no statistically significant difference in customer ratings between the record store and the phone store, which could be explained by the two stores offering a similar customer experience (e.g. friendliness and comfort).
# NUMBER 6: NONPARAMETRIC DEPENDENT TEST (Wilcoxon signed-rank)

#Data is generated to show people's weight (lbs) before and after a diet program (dependent). This data can be considered to be ratio data because there exists a true zero of 0 lbs, which makes the ratio between numbers meaningful. 

before_weight <- sample(75:100, 20, replace = TRUE)
after_weight <- before_weight - sample(1:7, 20, replace = TRUE)

# First, testing for normality of data to determine whether a parametric or nonparametric test should be conducted
shapiro.test(before_weight) # in this run the p-value is above 0.05; given the small sample of discrete values with ties, a nonparametric test is still used below as a conservative choice
## 
##  Shapiro-Wilk normality test
## 
## data:  before_weight
## W = 0.95632, p-value = 0.4733
shapiro.test(after_weight) # likewise above 0.05 in this run; see the note above
## 
##  Shapiro-Wilk normality test
## 
## data:  after_weight
## W = 0.96951, p-value = 0.7444
# Since the samples are paired (dependent) and the data consists of discrete values with ties, a Wilcoxon signed-rank test (the nonparametric paired test) is conducted

# Generating a dataframe to be used in the test
data <- data.frame(
  Before = before_weight,
  After = after_weight
)

# Perform Wilcoxon signed-rank test
wilcox.test(data$Before, data$After, paired = TRUE)
## Warning in wilcox.test.default(data$Before, data$After, paired = TRUE): cannot
## compute exact p-value with ties
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  data$Before and data$After
## V = 210, p-value = 8.966e-05
## alternative hypothesis: true location shift is not equal to 0
# P-value < 0.05 indicates that we reject the null hypothesis, which states that there is no shift between the paired before/after distributions. In this example, the statistically significant difference indicates that weights after the diet program are lower than before, suggesting the diet plan does impact an individual's weight.