##Investigation of School shootings in the USA
#
#Read Data into File
#Set your working directory
setwd("C:/Users/Cathie/SkyDrive/Rprogs")
#read in the data
df<-fread("./USAGunData.csv")
##Data Dictionary sources for each Field
#
# POV https://worldpopulationreview.com/state-rankings/poverty-rate-by-state
# OwnGun https://worldpopulationreview.com/state-rankings/gun-ownership-by-state
# Laws https://giffords.org/lawcenter/resources/scorecard/
# NOTE: is a ranking scale converted for Literal Grade A+ to F, A+ to C+ (1) rest 2
# ScShoot https://worldpopulationreview.com/state-rankings/school-shootings-by-state https://www.chds.us/ssdb/
# PostHigh https://worldpopulationreview.com/state-rankings/most-educated-states
# Pop https://www.infoplease.com/us/states/state-population-by-rank
# GDPp https://worldpopulationreview.com/state-rankings/gdp-by-state
# Party https://www.pewresearch.org/religion/religious-landscape-study/compare/party-affiliation/by/state/
# MHq https://www.mhanational.org/issues/ranking-states
# NOTE: MHq young facilties mental health state ranking converted to Quadrilles
#replace row number with State abbrv col 1
df <- df %>% remove_rownames %>% column_to_rownames(var="Abv")
#check structure & get summary Stats
#str(df)
#summary(df)
#what is normally distrbuted, Ho Normal, with p>0.05
xx<- apply(df[,2:7],2, shapiro.test)
pList<-as.data.frame(unlist(lapply(xx, function(x) x$p.value)))
round(pList,4)
## unlist(lapply(xx, function(x) x$p.value))
## ScShot 0.0000
## PostHigh 0.0001
## POV 0.0165
## OwnGun 0.7540
## Pop 0.0000
## GDPp 0.0000
#p<0 except for OwnGun, thus not normal distributions
par(mfrow=c(2,3))
#apply(df[,2:7],2,hist)
library(Hmisc)
hist.data.frame(df[, 2:7])
#fab reference https://r-coder.com/boxplot-r/
apply(df[,2:7],2, boxplot) #check above for match position
#look at distributions and correlations
library("GGally")
ggpairs(df[,2:7])
#correlations and significance
#install.packages("Hmisc")
library(Hmisc)
res2 <- rcorr(as.matrix(df[,2:7]))
res2 #shows groupings into pairs
## ScShot PostHigh POV OwnGun Pop GDPp
## ScShot 1.00 0.15 0.18 -0.21 0.94 0.89
## PostHigh 0.15 1.00 -0.19 -0.22 0.12 0.08
## POV 0.18 -0.19 1.00 0.38 0.09 0.03
## OwnGun -0.21 -0.22 0.38 1.00 -0.28 -0.32
## Pop 0.94 0.12 0.09 -0.28 1.00 0.98
## GDPp 0.89 0.08 0.03 -0.32 0.98 1.00
##
## n= 50
##
##
## P
## ScShot PostHigh POV OwnGun Pop GDPp
## ScShot 0.2978 0.2109 0.1522 0.0000 0.0000
## PostHigh 0.2978 0.1771 0.1302 0.4145 0.5852
## POV 0.2109 0.1771 0.0073 0.5398 0.8630
## OwnGun 0.1522 0.1302 0.0073 0.0497 0.0215
## Pop 0.0000 0.4145 0.5398 0.0497 0.0000
## GDPp 0.0000 0.5852 0.8630 0.0215 0.0000
#correlation matrix and view
t<-cor(df[, 2:7])
#gives a nice visual of correlations.
#Spearmans etc too, don't forget
library(corrplot)
corrplot(t, type = "upper", order = "hclust",
tl.col = "black", tl.srt = 45)
#basic heatmap
heatmap(x = t) #everyone loves the heat!
#We see that school shotings, GDP and POP are related, not surprising
#check mean difference
#1 = Rep, 2=Dem
means<-aggregate(df[,2:7],by=list(df$Party),FUN=mean)
means
## Group.1 ScShot PostHigh POV OwnGun Pop GDPp
## 1 1 26.50000 33.78333 0.1141667 0.3002083 0.01958592 0.02041744
## 2 2 28.96154 31.98538 0.1250000 0.3580385 0.02038223 0.01961467
#Check OwnGun based on split using Party
library("dplyr")
group_by(df, Party) %>%
summarise(
mean = mean(ScShot, na.rm = TRUE),
count = n(),
median = median(ScShot, na.rm = TRUE),
IQR = IQR(ScShot, na.rm = TRUE)
)
## # A tibble: 2 x 5
## Party mean count median IQR
## <int> <dbl> <int> <dbl> <dbl>
## 1 1 26.5 24 13.5 25.5
## 2 2 29.0 26 17.5 37
#2 non-parametric tests of significant difference, difference in mean
#here one group. ref: http://www.sthda.com/english/wiki/kruskal-wallis-test-in-r
#non-parametric alternatives to ANOVA
#ANOVA has assumption of normal distribution, equal variances etc
#we seem not to have normally distributed data
kruskal.test(ScShot ~ Party, data = df) #p>0.05 hence accept Ho no difference
##
## Kruskal-Wallis rank sum test
##
## data: ScShot by Party
## Kruskal-Wallis chi-squared = 0.068824, df = 1, p-value = 0.7931
wilcox.test(ScShot ~ Party, data = df) #ditto, check others
##
## Wilcoxon rank sum test with continuity correction
##
## data: ScShot by Party
## W = 298.5, p-value = 0.8006
## alternative hypothesis: true location shift is not equal to 0
kruskal.test(POV ~ Party, data = df) #p>0.05
##
## Kruskal-Wallis rank sum test
##
## data: POV by Party
## Kruskal-Wallis chi-squared = 2.8745, df = 1, p-value = 0.08999
kruskal.test(OwnGun ~ Party, data = df) #p>0.05
##
## Kruskal-Wallis rank sum test
##
## data: OwnGun by Party
## Kruskal-Wallis chi-squared = 3.5136, df = 1, p-value = 0.06087
#Check PCA and visualisation look for patterns
colnames(df)
## [1] "State" "ScShot" "PostHigh" "POV" "OwnGun" "Pop"
## [7] "GDPp" "Party" "Laws" "MHq"
dev.off()
## null device
## 1
library(psych)
#Can we run PCA?
#1. check correlations less than 0.30 or so.
cor(df[, 2:7]) # there are quiet a few. These can be left out
## ScShot PostHigh POV OwnGun Pop GDPp
## ScShot 1.0000000 0.15019707 0.18004688 -0.2055326 0.94111543 0.89313140
## PostHigh 0.1501971 1.00000000 -0.19397638 -0.2169349 0.11798613 0.07906517
## POV 0.1800469 -0.19397638 1.00000000 0.3751640 0.08879145 0.02503469
## OwnGun -0.2055326 -0.21693492 0.37516397 1.0000000 -0.27910761 -0.32452475
## Pop 0.9411154 0.11798613 0.08879145 -0.2791076 1.00000000 0.97711320
## GDPp 0.8931314 0.07906517 0.02503469 -0.3245248 0.97711320 1.00000000
#2. Bartlett's Test - compares matrix to I, needs to vary. p<0.05
b<-cortest.bartlett(df[,2:7])
b$p.value #<0.05 hence has sufficient variance to perform data reduction
## [1] 2.713591e-50
#3. KMO (Kaiser-Meyer-Olkin)
KMO(as.matrix(df[,2:7])) #2nd and 3rd near 0.5 can be removed, others just ok
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = as.matrix(df[, 2:7]))
## Overall MSA = 0.66
## MSA for each item =
## ScShot PostHigh POV OwnGun Pop GDPp
## 0.76 0.44 0.49 0.72 0.61 0.66
KMO(as.matrix(df[,c(2,5:7)])) #bit better
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = as.matrix(df[, c(2, 5:7)]))
## Overall MSA = 0.66
## MSA for each item =
## ScShot OwnGun Pop GDPp
## 0.74 0.80 0.60 0.66
#run the PCA with all variables then again with 2nd and 3rd removed
pca <- prcomp(~ScShot+Pop+GDPp+PostHigh+POV+OwnGun, data = df,
scale = TRUE)
pca <- prcomp(~ScShot+Pop+GDPp+OwnGun, data = df,
scale = TRUE)
screeplot(pca) #elbow method, select 2
pca #shows PC1 = ScShoot, Pop, GDPp and PC2=OwnGun
## Standard deviations (1, .., p=4):
## [1] 1.7277138 0.9475452 0.3214783 0.1175366
##
## Rotation (n x k) = (4 x 4):
## PC1 PC2 PC3 PC4
## ScShot -0.5489689 0.2053607 -0.7729217 -0.24300648
## Pop -0.5709752 0.1270069 0.1914848 0.78815619
## GDPp -0.5656026 0.0654874 0.5968523 -0.56530740
## OwnGun 0.2295958 0.9681981 0.0984525 -0.01360962
biplot(pca)
fviz_pca_biplot(pca, habillage = df$Party ,
col.var="black",
repel=TRUE, axes = c(1, 2))
#NOTE there are two other grouping variables
#1. Laws - derived by me see above
#2. MHq - ditto
fviz_pca_biplot(pca, habillage = df$Laws,
col.var="black",
repel=TRUE, axes = c(1, 2))
fviz_pca_biplot(pca, habillage = df$MHq,
col.var="black",
repel=TRUE, axes = c(1, 2))
### Go Back and repeat the grouping statistics.
#barplot a frequency distribution
barplot(table(df$POV))
# Box plots ref: http://www.sthda.com/english/wiki/one-way-anova-test-in-r
# ++++++++++++++++++++
# Plot weight by group and color by group
library("ggpubr")
ggboxplot(df, x = "Party", y = "POV",
color = "Party", palette = c("#00AFBB", "#E7B800"),
order = c(1,2),
ylab = "POV", xlab = "Party")
library("gplots")
plotmeans(POV ~ Party, data = df, frame = FALSE,
xlab = "Treatment", ylab = "Weight",
main="Mean Plot POV by Party with 95% CI")
#check clusterability, close to uniform, 0.5 random, close to 1 highly
clusterable str(df)
cl<-get_clust_tendency(data = df[,2:7], n = 5)
cl$hopkins_stat #low, best near 1, but that does not mean there are none
## [1] 0.5672689
# Compute PAM Clusters version 1 - simplest syntax structure - many available
library("cluster")
p<- pam(df[,2:7], 3) #3 is the desired clusters, three 3 etc. Observation looks like Pop
# Visualize
fviz_cluster(p) #TX and CA both have highest number of school shootings
# Hierarchical clustering
hc <- hclust(dist(df[,2:7]))
# Color labels using k-means clusters
km.clust <- kmeans(df[,2:7], 4)$cluster
fviz_dend(hc, k = 4,
k_colors = c("blue", "green3", "red", "black"),
label_cols = km.clust[hc$order], cex = 0.6)
#Factor analysis - common factor anlysis? https://www.statmethods.net/advstats/factor.html
dfact <- factanal(df[,2:7], factors = 3)
dfact
##
## Call:
## factanal(x = df[, 2:7], factors = 3)
##
## Uniquenesses:
## ScShot PostHigh POV OwnGun Pop GDPp
## 0.056 0.708 0.492 0.597 0.012 0.005
##
## Loadings:
## Factor1 Factor2 Factor3
## ScShot 0.939 0.114 0.223
## PostHigh -0.194 0.500
## POV 0.140 0.684 -0.143
## OwnGun -0.239 0.559 -0.182
## Pop 0.989
## GDPp 0.982 -0.173
##
## Factor1 Factor2 Factor3
## SS loadings 2.905 0.864 0.362
## Proportion Var 0.484 0.144 0.060
## Cumulative Var 0.484 0.628 0.688
##
## The degrees of freedom for the model is 0 and the fit was 0.002
apply(dfact$loadings^2,1,sum) # communality
## ScShot PostHigh POV OwnGun Pop GDPp
## 0.9438037 0.2922352 0.5079460 0.4025249 0.9884430 0.9950028
1 - apply(dfact$loadings^2,1,sum) # uniqueness
## ScShot PostHigh POV OwnGun Pop GDPp
## 0.056196335 0.707764782 0.492053999 0.597475059 0.011557004 0.004997215
Lambda<-dfact$loadings
Psi<-diag(dfact$uniquenesses)
S<-dfact$correlation
Sigma<-Lambda %*% t(Lambda) + Psi
round(S - Sigma, 6)
## ScShot PostHigh POV OwnGun Pop GDPp
## ScShot 0.000000 -0.001115 0.002562 -0.004629 0.000003 -0.000014
## PostHigh -0.001115 0.000095 0.001390 -0.001556 0.000470 -0.000115
## POV 0.002562 0.001390 0.000018 0.000215 -0.001205 0.000307
## OwnGun -0.004629 -0.001556 0.000215 0.000026 0.002132 -0.000535
## Pop 0.000003 0.000470 -0.001205 0.002132 0.000000 0.000006
## GDPp -0.000014 -0.000115 0.000307 -0.000535 0.000006 -0.000003
#regression
#we have seen with PCA despite lowish correlations that there are groups
#factor analysis confirmed these
#fit linear regression model
plot(x=df$Pop,y=df$ScShot)
abline(lm(ScShot~Pop,data=df),col='red')
text(x = df$Pop, y = df$ScShot,labels=rownames(df))
model<- lm(df$ScShot~df$Pop)
model
##
## Call:
## lm(formula = df$ScShot ~ df$Pop)
##
## Coefficients:
## (Intercept) df$Pop
## 0.5291 1362.5451
library(lmtest)
#perform Breusch-Pagan Test - looking for a pattern in the errors/residuals
bptest(model) #p>0.05 accept HO of no pattern - good to go
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 2.2212, df = 1, p-value = 0.1361
#multiple regression
#gone overboard here just shows how to do it
model <- lm(df$ScShot~df$Pop+df$GDPp+df$PostHigh+df$POV+df$OwnGun)
model
##
## Call:
## lm(formula = df$ScShot ~ df$Pop + df$GDPp + df$PostHigh + df$POV +
## df$OwnGun)
##
## Coefficients:
## (Intercept) df$Pop df$GDPp df$PostHigh df$POV df$OwnGun
## -18.6352 1967.7854 -549.1111 0.1845 90.6631 3.3864
bptest(model) #again p>0.05 ok
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 6.6774, df = 5, p-value = 0.2458
summary(model)
##
## Call:
## lm(formula = df$ScShot ~ df$Pop + df$GDPp + df$PostHigh + df$POV +
## df$OwnGun)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.692 -5.797 0.073 4.222 24.954
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -18.6352 11.8049 -1.579 0.1216
## df$Pop 1967.7854 343.8562 5.723 8.61e-07 ***
## df$GDPp -549.1111 299.9167 -1.831 0.0739 .
## df$PostHigh 0.1845 0.2206 0.837 0.4074
## df$POV 90.6631 65.9829 1.374 0.1764
## df$OwnGun 3.3864 12.9952 0.261 0.7956
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.53 on 44 degrees of freedom
## Multiple R-squared: 0.9067, Adjusted R-squared: 0.8961
## F-statistic: 85.51 on 5 and 44 DF, p-value: < 2.2e-16
plot(model)
plot(resid(model))
abline(0, 0)
model2 <- model <- lm(log(df$ScShot)~df$Pop+df$GDPp+df$PostHigh+df$POV+df$OwnGun)
summary(model2)
##
## Call:
## lm(formula = log(df$ScShot) ~ df$Pop + df$GDPp + df$PostHigh +
## df$POV + df$OwnGun)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.5118 -0.4848 0.1683 0.4595 1.3289
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.80075 0.81048 0.988 0.32856
## df$Pop 79.81232 23.60791 3.381 0.00153 **
## df$GDPp -40.34271 20.59119 -1.959 0.05644 .
## df$PostHigh 0.01203 0.01515 0.795 0.43109
## df$POV 11.13255 4.53014 2.457 0.01800 *
## df$OwnGun -1.81501 0.89220 -2.034 0.04798 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7228 on 44 degrees of freedom
## Multiple R-squared: 0.6668, Adjusted R-squared: 0.6289
## F-statistic: 17.61 on 5 and 44 DF, p-value: 1.502e-09
#ANOVA - check assumptions.
#1. Normal distributions - NO, see earlier Shapior test
colnames(df)
## [1] "State" "ScShot" "PostHigh" "POV" "OwnGun" "Pop"
## [7] "GDPp" "Party" "Laws" "MHq"
#we can the sub-sets
list1<-df[df$Party==1,2]
list2<-df[df$Party==2,2]
shapiro.test(list2) #both sub-sets are not Normal p<0.05 Accept Alternative Ho
##
## Shapiro-Wilk normality test
##
## data: list2
## W = 0.80154, p-value = 0.000187
#2. equal variance - Seems that way
library(car)
#The Fligner-Killeen test is one of the many tests for homogeneity of variance
#which is most robust against departures from normality.
fligner.test(ScShot ~ Party, data = df) #p>0.05 hence accept Ho of no var difference
##
## Fligner-Killeen test of homogeneity of variances
##
## data: ScShot by Party
## Fligner-Killeen:med chi-squared = 1.4845, df = 1, p-value = 0.2231
#3. few significant outliers - check CA and TX
# Compute the analysis of variance - repeat on different varables as desired
aov <- aov(ScShot ~ as.factor(Party), data = df)
# Summary of the analysis
summary(aov) #p>0.05 no difference
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Party) 1 76 75.6 0.07 0.793
## Residuals 48 52181 1087.1
TukeyHSD(aov) #p>0.05 no difference
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = ScShot ~ as.factor(Party), data = df)
##
## $`as.factor(Party)`
## diff lwr upr p adj
## 2-1 2.461538 -16.30401 21.22709 0.7931081
leveneTest(ScShot ~ as.factor(Party), data = df) #p>0.05 no difference in var
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 0.1677 0.684
## 48
# 1. Homogeneity of variances
plot(aov, 1) # ok, but outliers TX and CA
# 2. Normality
plot(aov, 2) #residuals follow line? Not really, maybe remove outliers
# Extract the residuals
aov_residuals <- residuals(object = aov )
# Run Shapiro-Wilk test
shapiro.test(x = aov_residuals ) #p<0.05 reject Ho residuals not normal
##
## Shapiro-Wilk normality test
##
## data: aov_residuals
## W = 0.73134, p-value = 3.162e-08
plot(aov_residuals) #scattered
#Look at training models and prediction
library(MASS)
library(caret)
# Split the data into training and test set
set.seed(123)
train.data <- df[1:40, ]
test.data <- df[41:50, ]
ggplot(train.data, aes(OwnGun, ScShot) ) +
geom_point() +
stat_smooth()
#Other models http://www.sthda.com/english/articles/40-regression-analysis/162-nonlinear-regression-essentials-in-r-polynomial-and-spline-regression-models/
# Build the model - poly Nth
model <- lm(ScShot ~ OwnGun, data = train.data)
# Make predictions
predictions <- model %>% predict(test.data)
predictions
## SD TN TX UT VT VA WA WV
## 26.36304 24.25373 26.02747 27.80120 29.23936 29.23936 29.71875 17.15880
## WI WY
## 26.36304 17.25468
# Model performance
data.frame(
RMSE = RMSE(predictions, test.data$OwnGun),
R2 = R2(predictions, test.data$ScShot)
)
## RMSE R2
## 1 25.3675 0.03150978
ggplot(train.data, aes(OwnGun, ScShot) ) +
geom_point() +
stat_smooth(method = lm, formula = y ~ x)
lm(ScShot ~ poly(OwnGun, 2, raw = TRUE), data = train.data)
##
## Call:
## lm(formula = ScShot ~ poly(OwnGun, 2, raw = TRUE), data = train.data)
##
## Coefficients:
## (Intercept) poly(OwnGun, 2, raw = TRUE)1
## 15.15 160.27
## poly(OwnGun, 2, raw = TRUE)2
## -315.50
lm(ScShot ~ poly(OwnGun, 6, raw = TRUE), data = train.data) %>%
summary()
##
## Call:
## lm(formula = ScShot ~ poly(OwnGun, 6, raw = TRUE), data = train.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.432 -15.319 -3.548 9.372 115.661
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 74.12 158.17 0.469 0.642
## poly(OwnGun, 6, raw = TRUE)1 -2679.29 4820.94 -0.556 0.582
## poly(OwnGun, 6, raw = TRUE)2 36026.38 51222.53 0.703 0.487
## poly(OwnGun, 6, raw = TRUE)3 -192968.35 256353.42 -0.753 0.457
## poly(OwnGun, 6, raw = TRUE)4 491537.82 656877.72 0.748 0.460
## poly(OwnGun, 6, raw = TRUE)5 -599097.82 832988.28 -0.719 0.477
## poly(OwnGun, 6, raw = TRUE)6 281487.70 413876.11 0.680 0.501
##
## Residual standard error: 30.34 on 33 degrees of freedom
## Multiple R-squared: 0.1902, Adjusted R-squared: 0.04292
## F-statistic: 1.292 on 6 and 33 DF, p-value: 0.2882
# Build the model
model <- lm(ScShot ~ poly(OwnGun, 2, raw = TRUE), data = train.data)
# Make predictions
predictions <- model %>% predict(test.data)
# Model performance
data.frame(
RMSE = RMSE(predictions, test.data$ScShot),
R2 = R2(predictions, test.data$ScShot)
)
## RMSE R2
## 1 37.06247 0.06894787
ggplot(train.data, aes(OwnGun, ScShot) ) +
geom_point() +
stat_smooth(method = lm, formula = y ~ poly(x, 2, raw = TRUE))
#Log transformations
# Build the model
model <- lm(ScShot ~ log(OwnGun), data = train.data)
# Make predictions
predictions <- model %>% predict(test.data)
# Model performance
data.frame(
RMSE = RMSE(predictions, test.data$ScShot),
R2 = R2(predictions, test.data$ScShot)
)
## RMSE R2
## 1 38.26886 0.01893637
ggplot(train.data, aes(OwnGun, ScShot) ) +
geom_point() +
stat_smooth(method = lm, formula = y ~ log(x))
#Spline Sets
knots <- quantile(train.data$OwnGun, p = c(0.25, 0.5, 0.75))
library(splines)
# Build the model
knots <- quantile(train.data$lstat, p = c(0.25, 0.5, 0.75))
model <- lm (ScShot ~ bs(OwnGun, knots = knots), data = train.data)
# Make predictions
predictions <- model %>% predict(test.data)
# Model performance
data.frame(
RMSE = RMSE(predictions, test.data$ScShot),
R2 = R2(predictions, test.data$ScShot)
)
## RMSE R2
## 1 38.3395 0.02127615
ggplot(train.data, aes(OwnGun, ScShot) ) +
geom_point() +
stat_smooth(method = lm, formula = y ~ splines::bs(x, df = 3))
#
library(mgcv)
# Build the model
model <- gam(ScShot ~ s(OwnGun), data = train.data)
# Make predictions
predictions <- model %>% predict(test.data)
# Model performance
data.frame(
RMSE = RMSE(predictions, test.data$ScShot),
R2 = R2(predictions, test.data$ScShot)
)
## RMSE R2
## 1 37.82311 0.03142867
ggplot(train.data, aes(OwnGun, ScShot) ) +
geom_point() +
stat_smooth(method = gam, formula = y ~ s(x))