# dplyr provides `%>%` and the data-frame `filter()` used below; without it
# attached, `filter` resolves to stats::filter and the pipe is undefined.
library(dplyr)

# Read the input file.
data <- read.csv("merge_wrangled_cleaned.csv")

# Keep rows where gender is "Female" and the age column is not Inf.
# Rows where the age column is NA are removed as well: `NA != Inf` is NA, and
# filter() drops rows whose condition is NA.
forced_penetration_women <- data %>%
  filter(
    ag_forced_penetration_age_num != Inf,
    gender == "Female"
  )
# Variable of interest for the filtered subset.
var <- forced_penetration_women$ag_forced_penetration_age_num

### Central tendency and distribution
# Coerce to numeric BEFORE dropping missing values: the coercion itself can
# introduce NAs (non-numeric text), and those would otherwise propagate into
# every statistic below. Plain subsetting is used instead of na.omit() so that
# `var` stays an attribute-free numeric vector.
var <- as.numeric(as.character(var))
var <- var[!is.na(var)]

mean(var) # mean
## [1] 16.3004
stats::var(var) # variance; `stats::` because the data vector is also named `var`
## [1] 38.74274
sd(var) # standard deviation
## [1] 6.224366
### Testing for Normality (Informal Analytical Approach)
# Reference sample from a normal distribution with the SAME mean and standard
# deviation as the observed data, so that the histograms and the two-sample
# QQ plot (with its identity line) are on a comparable scale.
# The seed and the sample size (2944) are fixed for reproducibility.
set.seed(3000)
data_normal <- rnorm(2944, mean = mean(var), sd = sd(var))

# Histograms
hist(data_normal)
hist(var) # looks largely comparable; rougher than normal, partly due to small n

# QQ plots
# qqline() adds the reference line through the quartiles to each normal QQ plot.
qqnorm(data_normal)
qqline(data_normal)
qqnorm(var)
qqline(var)
# Sample quantiles against reference-sample quantiles; points on the identity
# line drawn by abline(0, 1) indicate matching distributions.
# Observation: the sample shows more kurtosis than the normal reference.
qqplot(data_normal, var)
abline(0, 1)

# Skewness and kurtosis
# Both measures are location/scale invariant, so the values for the reference
# sample depend only on the seed and the sample size.
library(moments)
skewness(var) # 0.02147941 - takeaway: data is very nearly symmetrical
## [1] 0.02147941
skewness(data_normal) # 0.020939
## [1] 0.020939
kurtosis(var) # 4.097 - above the ~3 of a normal distribution (leptokurtic)
## [1] 4.097042
kurtosis(data_normal) # 2.933619 - close to 3, as expected for normal draws
## [1] 2.933619
# Informal checks point toward a leptokurtic distribution: sharper central
# peak and heavier tails than a normal distribution.
# Fit a normal distribution to the data by maximum likelihood.
# fitdist(), its plot method and gofstat() come from fitdistrplus.
library(fitdistrplus)
fit.normal <- fitdist(var, distr = "norm")

# Log-likelihood of the fitted model (read from the existing fit rather than
# fitting the same model a second time).
fit.normal$loglik
## [1] -821.0938

# Diagnostic plots for the fit.
plot(fit.normal)

# Goodness-of-fit statistics and information criteria for the normal fit.
gofstat(fit.normal)
## Goodness-of-fit statistics
## 1-mle-norm
## Kolmogorov-Smirnov statistic 0.1601767
## Cramer-von Mises statistic 1.4162856
## Anderson-Darling statistic 6.9532536
##
## Goodness-of-fit criteria
## 1-mle-norm
## Akaike's Information Criterion 1646.188
## Bayesian Information Criterion 1653.254