##Investigation of School shootings in the USA
#
#Read Data into File
#Set your working directory
setwd("C:/Users/Cathie/SkyDrive/Rprogs")
#read in the data
df<-fread("./USAGunData.csv") 

##Data Dictionary sources for each Field
#
# POV     https://worldpopulationreview.com/state-rankings/poverty-rate-by-state
# OwnGun  https://worldpopulationreview.com/state-rankings/gun-ownership-by-state
# Laws      https://giffords.org/lawcenter/resources/scorecard/             
# NOTE: is a ranking scale converted for Literal Grade A+ to F, A+ to C+ (1) rest 2
# ScShoot https://worldpopulationreview.com/state-rankings/school-shootings-by-state https://www.chds.us/ssdb/
# PostHigh https://worldpopulationreview.com/state-rankings/most-educated-states
# Pop      https://www.infoplease.com/us/states/state-population-by-rank
# GDPp    https://worldpopulationreview.com/state-rankings/gdp-by-state
# Party   https://www.pewresearch.org/religion/religious-landscape-study/compare/party-affiliation/by/state/
# MHq     https://www.mhanational.org/issues/ranking-states
# NOTE:   MHq young facilties mental health state ranking converted to Quadrilles

#replace row number with State abbrv col 1
df <- df %>% remove_rownames %>% column_to_rownames(var="Abv")

#check structure & get summary Stats
#str(df) 
#summary(df)
#what is normally distrbuted, Ho Normal, with p>0.05
xx<- apply(df[,2:7],2, shapiro.test)
pList<-as.data.frame(unlist(lapply(xx, function(x) x$p.value)))
round(pList,4)
##          unlist(lapply(xx, function(x) x$p.value))
## ScShot                                      0.0000
## PostHigh                                    0.0001
## POV                                         0.0165
## OwnGun                                      0.7540
## Pop                                         0.0000
## GDPp                                        0.0000
#p<0 except for OwnGun, thus not normal distributions
par(mfrow=c(2,3))
#apply(df[,2:7],2,hist) 
library(Hmisc)
hist.data.frame(df[, 2:7])

#fab reference https://r-coder.com/boxplot-r/
apply(df[,2:7],2, boxplot) #check above for match position

#look at distributions and correlations
library("GGally")                     
ggpairs(df[,2:7])  
#correlations and significance
#install.packages("Hmisc")
library(Hmisc)
res2 <- rcorr(as.matrix(df[,2:7]))
res2 #shows groupings into pairs
##          ScShot PostHigh   POV OwnGun   Pop  GDPp
## ScShot     1.00     0.15  0.18  -0.21  0.94  0.89
## PostHigh   0.15     1.00 -0.19  -0.22  0.12  0.08
## POV        0.18    -0.19  1.00   0.38  0.09  0.03
## OwnGun    -0.21    -0.22  0.38   1.00 -0.28 -0.32
## Pop        0.94     0.12  0.09  -0.28  1.00  0.98
## GDPp       0.89     0.08  0.03  -0.32  0.98  1.00
## 
## n= 50 
## 
## 
## P
##          ScShot PostHigh POV    OwnGun Pop    GDPp  
## ScShot          0.2978   0.2109 0.1522 0.0000 0.0000
## PostHigh 0.2978          0.1771 0.1302 0.4145 0.5852
## POV      0.2109 0.1771          0.0073 0.5398 0.8630
## OwnGun   0.1522 0.1302   0.0073        0.0497 0.0215
## Pop      0.0000 0.4145   0.5398 0.0497        0.0000
## GDPp     0.0000 0.5852   0.8630 0.0215 0.0000
#correlation matrix and view
t<-cor(df[, 2:7])
#gives a nice visual of correlations.
#Spearmans etc too, don't forget
library(corrplot) 
corrplot(t, type = "upper", order = "hclust", 
         tl.col = "black", tl.srt = 45)

#basic heatmap
heatmap(x = t) #everyone loves the heat!

#We see that school shotings, GDP and POP are related, not surprising
#check mean difference
#1 = Rep, 2=Dem
means<-aggregate(df[,2:7],by=list(df$Party),FUN=mean)
means
##   Group.1   ScShot PostHigh       POV    OwnGun        Pop       GDPp
## 1       1 26.50000 33.78333 0.1141667 0.3002083 0.01958592 0.02041744
## 2       2 28.96154 31.98538 0.1250000 0.3580385 0.02038223 0.01961467
#Check OwnGun based on split using Party
library("dplyr")
group_by(df, Party) %>%
  summarise(
    mean = mean(ScShot, na.rm = TRUE),
    count = n(),
    median = median(ScShot, na.rm = TRUE),
    IQR = IQR(ScShot, na.rm = TRUE)
  )
## # A tibble: 2 x 5
##   Party  mean count median   IQR
##   <int> <dbl> <int>  <dbl> <dbl>
## 1     1  26.5    24   13.5  25.5
## 2     2  29.0    26   17.5  37
#2 non-parametric tests of significant difference, difference in mean 
#here one group. ref: http://www.sthda.com/english/wiki/kruskal-wallis-test-in-r
#non-parametric alternatives to ANOVA 
#ANOVA has assumption of normal distribution, equal variances etc
#we seem not to have normally distributed data

kruskal.test(ScShot ~ Party, data = df) #p>0.05 hence accept Ho no difference
## 
##  Kruskal-Wallis rank sum test
## 
## data:  ScShot by Party
## Kruskal-Wallis chi-squared = 0.068824, df = 1, p-value = 0.7931
wilcox.test(ScShot ~ Party, data = df) #ditto, check others
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  ScShot by Party
## W = 298.5, p-value = 0.8006
## alternative hypothesis: true location shift is not equal to 0
kruskal.test(POV ~ Party, data = df) #p>0.05 
## 
##  Kruskal-Wallis rank sum test
## 
## data:  POV by Party
## Kruskal-Wallis chi-squared = 2.8745, df = 1, p-value = 0.08999
kruskal.test(OwnGun ~ Party, data = df) #p>0.05 
## 
##  Kruskal-Wallis rank sum test
## 
## data:  OwnGun by Party
## Kruskal-Wallis chi-squared = 3.5136, df = 1, p-value = 0.06087
#Check PCA and visualisation look for patterns

colnames(df)
##  [1] "State"    "ScShot"   "PostHigh" "POV"      "OwnGun"   "Pop"     
##  [7] "GDPp"     "Party"    "Laws"     "MHq"
dev.off()
## null device 
##           1
library(psych)
#Can we run PCA?
#1. check correlations less than 0.30 or so.
cor(df[, 2:7]) # there are quiet a few. These can be left out
##              ScShot    PostHigh         POV     OwnGun         Pop        GDPp
## ScShot    1.0000000  0.15019707  0.18004688 -0.2055326  0.94111543  0.89313140
## PostHigh  0.1501971  1.00000000 -0.19397638 -0.2169349  0.11798613  0.07906517
## POV       0.1800469 -0.19397638  1.00000000  0.3751640  0.08879145  0.02503469
## OwnGun   -0.2055326 -0.21693492  0.37516397  1.0000000 -0.27910761 -0.32452475
## Pop       0.9411154  0.11798613  0.08879145 -0.2791076  1.00000000  0.97711320
## GDPp      0.8931314  0.07906517  0.02503469 -0.3245248  0.97711320  1.00000000
#2. Bartlett's Test - compares matrix to I, needs to vary. p<0.05
b<-cortest.bartlett(df[,2:7])
b$p.value #<0.05 hence has sufficient variance to perform data reduction
## [1] 2.713591e-50
#3. KMO (Kaiser-Meyer-Olkin)
KMO(as.matrix(df[,2:7])) #2nd and 3rd near 0.5 can be removed, others just ok
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = as.matrix(df[, 2:7]))
## Overall MSA =  0.66
## MSA for each item = 
##   ScShot PostHigh      POV   OwnGun      Pop     GDPp 
##     0.76     0.44     0.49     0.72     0.61     0.66
KMO(as.matrix(df[,c(2,5:7)])) #bit better
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = as.matrix(df[, c(2, 5:7)]))
## Overall MSA =  0.66
## MSA for each item = 
## ScShot OwnGun    Pop   GDPp 
##   0.74   0.80   0.60   0.66
#run the PCA with all variables then again with 2nd and 3rd removed
pca <- prcomp(~ScShot+Pop+GDPp+PostHigh+POV+OwnGun, data = df,
                 scale = TRUE)
pca <- prcomp(~ScShot+Pop+GDPp+OwnGun, data = df,
              scale = TRUE)
screeplot(pca) #elbow method, select 2

pca #shows PC1 = ScShoot, Pop, GDPp and PC2=OwnGun
## Standard deviations (1, .., p=4):
## [1] 1.7277138 0.9475452 0.3214783 0.1175366
## 
## Rotation (n x k) = (4 x 4):
##               PC1       PC2        PC3         PC4
## ScShot -0.5489689 0.2053607 -0.7729217 -0.24300648
## Pop    -0.5709752 0.1270069  0.1914848  0.78815619
## GDPp   -0.5656026 0.0654874  0.5968523 -0.56530740
## OwnGun  0.2295958 0.9681981  0.0984525 -0.01360962
biplot(pca)

fviz_pca_biplot(pca, habillage = df$Party ,
                col.var="black",
                repel=TRUE, axes = c(1, 2))

#NOTE there are two other grouping variables
#1. Laws - derived by me see above
#2. MHq  - ditto

fviz_pca_biplot(pca, habillage = df$Laws,
                col.var="black",
                repel=TRUE, axes = c(1, 2))

fviz_pca_biplot(pca, habillage = df$MHq,
                col.var="black",
                repel=TRUE, axes = c(1, 2))

### Go Back and repeat the grouping statistics.

#barplot a frequency distribution

barplot(table(df$POV))

# Box plots ref: http://www.sthda.com/english/wiki/one-way-anova-test-in-r
# ++++++++++++++++++++
# Plot weight by group and color by group
library("ggpubr")
ggboxplot(df, x = "Party", y = "POV", 
          color = "Party", palette = c("#00AFBB", "#E7B800"),
          order = c(1,2),
          ylab = "POV", xlab = "Party")

library("gplots")
plotmeans(POV ~ Party, data = df, frame = FALSE,
          xlab = "Treatment", ylab = "Weight",
          main="Mean Plot POV by Party with 95% CI") 

#check clusterability, close to uniform, 0.5 random, close to 1 highly clusterable str(df)

cl<-get_clust_tendency(data = df[,2:7], n = 5)
cl$hopkins_stat #low, best near 1, but that does not mean there are none
## [1] 0.5672689
# Compute PAM Clusters version 1 - simplest syntax structure - many available
library("cluster")
p<- pam(df[,2:7], 3) #3 is the desired clusters, three 3 etc. Observation looks like Pop
# Visualize
fviz_cluster(p) #TX and CA both have highest number of school shootings

# Hierarchical clustering
hc <- hclust(dist(df[,2:7]))
# Color labels using k-means clusters
km.clust <- kmeans(df[,2:7], 4)$cluster
fviz_dend(hc, k = 4,
          k_colors = c("blue", "green3", "red", "black"),
          label_cols =  km.clust[hc$order], cex = 0.6)

#Factor analysis - common factor anlysis? https://www.statmethods.net/advstats/factor.html
dfact <- factanal(df[,2:7], factors = 3)
dfact
## 
## Call:
## factanal(x = df[, 2:7], factors = 3)
## 
## Uniquenesses:
##   ScShot PostHigh      POV   OwnGun      Pop     GDPp 
##    0.056    0.708    0.492    0.597    0.012    0.005 
## 
## Loadings:
##          Factor1 Factor2 Factor3
## ScShot    0.939   0.114   0.223 
## PostHigh         -0.194   0.500 
## POV       0.140   0.684  -0.143 
## OwnGun   -0.239   0.559  -0.182 
## Pop       0.989                 
## GDPp      0.982  -0.173         
## 
##                Factor1 Factor2 Factor3
## SS loadings      2.905   0.864   0.362
## Proportion Var   0.484   0.144   0.060
## Cumulative Var   0.484   0.628   0.688
## 
## The degrees of freedom for the model is 0 and the fit was 0.002
apply(dfact$loadings^2,1,sum) # communality
##    ScShot  PostHigh       POV    OwnGun       Pop      GDPp 
## 0.9438037 0.2922352 0.5079460 0.4025249 0.9884430 0.9950028
1 - apply(dfact$loadings^2,1,sum) # uniqueness
##      ScShot    PostHigh         POV      OwnGun         Pop        GDPp 
## 0.056196335 0.707764782 0.492053999 0.597475059 0.011557004 0.004997215
Lambda<-dfact$loadings
Psi<-diag(dfact$uniquenesses)
S<-dfact$correlation
Sigma<-Lambda %*% t(Lambda) + Psi
round(S - Sigma, 6)
##             ScShot  PostHigh       POV    OwnGun       Pop      GDPp
## ScShot    0.000000 -0.001115  0.002562 -0.004629  0.000003 -0.000014
## PostHigh -0.001115  0.000095  0.001390 -0.001556  0.000470 -0.000115
## POV       0.002562  0.001390  0.000018  0.000215 -0.001205  0.000307
## OwnGun   -0.004629 -0.001556  0.000215  0.000026  0.002132 -0.000535
## Pop       0.000003  0.000470 -0.001205  0.002132  0.000000  0.000006
## GDPp     -0.000014 -0.000115  0.000307 -0.000535  0.000006 -0.000003
#regression
#we have seen with PCA despite lowish correlations that there are groups
#factor analysis confirmed these

#fit linear regression model
plot(x=df$Pop,y=df$ScShot)
abline(lm(ScShot~Pop,data=df),col='red')
text(x = df$Pop, y = df$ScShot,labels=rownames(df))

model<- lm(df$ScShot~df$Pop)
model
## 
## Call:
## lm(formula = df$ScShot ~ df$Pop)
## 
## Coefficients:
## (Intercept)       df$Pop  
##      0.5291    1362.5451
library(lmtest)
#perform Breusch-Pagan Test - looking for a pattern in the errors/residuals
bptest(model) #p>0.05 accept HO of no pattern - good to go
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 2.2212, df = 1, p-value = 0.1361
#multiple regression
#gone overboard here just shows how to do it
model <- lm(df$ScShot~df$Pop+df$GDPp+df$PostHigh+df$POV+df$OwnGun)
model
## 
## Call:
## lm(formula = df$ScShot ~ df$Pop + df$GDPp + df$PostHigh + df$POV + 
##     df$OwnGun)
## 
## Coefficients:
## (Intercept)       df$Pop      df$GDPp  df$PostHigh       df$POV    df$OwnGun  
##    -18.6352    1967.7854    -549.1111       0.1845      90.6631       3.3864
bptest(model) #again p>0.05 ok
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 6.6774, df = 5, p-value = 0.2458
summary(model)
## 
## Call:
## lm(formula = df$ScShot ~ df$Pop + df$GDPp + df$PostHigh + df$POV + 
##     df$OwnGun)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -22.692  -5.797   0.073   4.222  24.954 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -18.6352    11.8049  -1.579   0.1216    
## df$Pop      1967.7854   343.8562   5.723 8.61e-07 ***
## df$GDPp     -549.1111   299.9167  -1.831   0.0739 .  
## df$PostHigh    0.1845     0.2206   0.837   0.4074    
## df$POV        90.6631    65.9829   1.374   0.1764    
## df$OwnGun      3.3864    12.9952   0.261   0.7956    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.53 on 44 degrees of freedom
## Multiple R-squared:  0.9067, Adjusted R-squared:  0.8961 
## F-statistic: 85.51 on 5 and 44 DF,  p-value: < 2.2e-16
plot(model)

plot(resid(model))
abline(0, 0) 

model2 <- model <- lm(log(df$ScShot)~df$Pop+df$GDPp+df$PostHigh+df$POV+df$OwnGun)
summary(model2)
## 
## Call:
## lm(formula = log(df$ScShot) ~ df$Pop + df$GDPp + df$PostHigh + 
##     df$POV + df$OwnGun)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5118 -0.4848  0.1683  0.4595  1.3289 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   0.80075    0.81048   0.988  0.32856   
## df$Pop       79.81232   23.60791   3.381  0.00153 **
## df$GDPp     -40.34271   20.59119  -1.959  0.05644 . 
## df$PostHigh   0.01203    0.01515   0.795  0.43109   
## df$POV       11.13255    4.53014   2.457  0.01800 * 
## df$OwnGun    -1.81501    0.89220  -2.034  0.04798 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7228 on 44 degrees of freedom
## Multiple R-squared:  0.6668, Adjusted R-squared:  0.6289 
## F-statistic: 17.61 on 5 and 44 DF,  p-value: 1.502e-09
#ANOVA - check assumptions. 
#1. Normal distributions - NO, see earlier Shapior test
colnames(df)
##  [1] "State"    "ScShot"   "PostHigh" "POV"      "OwnGun"   "Pop"     
##  [7] "GDPp"     "Party"    "Laws"     "MHq"
#we can the sub-sets
list1<-df[df$Party==1,2]
list2<-df[df$Party==2,2]
shapiro.test(list2) #both sub-sets are not Normal p<0.05 Accept Alternative Ho
## 
##  Shapiro-Wilk normality test
## 
## data:  list2
## W = 0.80154, p-value = 0.000187
#2. equal variance - Seems that way

library(car)
#The Fligner-Killeen test is one of the many tests for homogeneity of variance
#which is most robust against departures from normality.
fligner.test(ScShot ~ Party, data = df) #p>0.05 hence accept Ho of no var difference
## 
##  Fligner-Killeen test of homogeneity of variances
## 
## data:  ScShot by Party
## Fligner-Killeen:med chi-squared = 1.4845, df = 1, p-value = 0.2231
#3. few significant outliers - check CA and TX

# Compute the analysis of variance - repeat on different varables as desired
aov <- aov(ScShot ~ as.factor(Party), data = df)

# Summary of the analysis
summary(aov) #p>0.05 no difference
##                  Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Party)  1     76    75.6    0.07  0.793
## Residuals        48  52181  1087.1
TukeyHSD(aov) #p>0.05 no difference
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = ScShot ~ as.factor(Party), data = df)
## 
## $`as.factor(Party)`
##         diff       lwr      upr     p adj
## 2-1 2.461538 -16.30401 21.22709 0.7931081
leveneTest(ScShot ~ as.factor(Party), data = df) #p>0.05 no difference in var
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  1  0.1677  0.684
##       48
# 1. Homogeneity of variances
plot(aov, 1) # ok, but outliers TX and CA

# 2. Normality
plot(aov, 2) #residuals follow line? Not really, maybe remove outliers

# Extract the residuals
aov_residuals <- residuals(object = aov )
# Run Shapiro-Wilk test
shapiro.test(x = aov_residuals ) #p<0.05 reject Ho residuals not normal
## 
##  Shapiro-Wilk normality test
## 
## data:  aov_residuals
## W = 0.73134, p-value = 3.162e-08
plot(aov_residuals) #scattered

#Look at training models and prediction
library(MASS)
library(caret)
# Split the data into training and test set
set.seed(123)

train.data  <- df[1:40, ]
test.data <- df[41:50, ]

ggplot(train.data, aes(OwnGun, ScShot) ) +
  geom_point() +
  stat_smooth()

#Other models http://www.sthda.com/english/articles/40-regression-analysis/162-nonlinear-regression-essentials-in-r-polynomial-and-spline-regression-models/
# Build the model - poly Nth
model <- lm(ScShot ~ OwnGun, data = train.data)
# Make predictions
predictions <- model %>% predict(test.data)
predictions
##       SD       TN       TX       UT       VT       VA       WA       WV 
## 26.36304 24.25373 26.02747 27.80120 29.23936 29.23936 29.71875 17.15880 
##       WI       WY 
## 26.36304 17.25468
# Model performance
data.frame(
  RMSE = RMSE(predictions, test.data$OwnGun),
  R2 = R2(predictions, test.data$ScShot)
)
##      RMSE         R2
## 1 25.3675 0.03150978
ggplot(train.data, aes(OwnGun, ScShot) ) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ x)

lm(ScShot ~ poly(OwnGun, 2, raw = TRUE), data = train.data)
## 
## Call:
## lm(formula = ScShot ~ poly(OwnGun, 2, raw = TRUE), data = train.data)
## 
## Coefficients:
##                  (Intercept)  poly(OwnGun, 2, raw = TRUE)1  
##                        15.15                        160.27  
## poly(OwnGun, 2, raw = TRUE)2  
##                      -315.50
lm(ScShot ~ poly(OwnGun, 6, raw = TRUE), data = train.data) %>%
  summary()
## 
## Call:
## lm(formula = ScShot ~ poly(OwnGun, 6, raw = TRUE), data = train.data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.432 -15.319  -3.548   9.372 115.661 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)
## (Intercept)                       74.12     158.17   0.469    0.642
## poly(OwnGun, 6, raw = TRUE)1   -2679.29    4820.94  -0.556    0.582
## poly(OwnGun, 6, raw = TRUE)2   36026.38   51222.53   0.703    0.487
## poly(OwnGun, 6, raw = TRUE)3 -192968.35  256353.42  -0.753    0.457
## poly(OwnGun, 6, raw = TRUE)4  491537.82  656877.72   0.748    0.460
## poly(OwnGun, 6, raw = TRUE)5 -599097.82  832988.28  -0.719    0.477
## poly(OwnGun, 6, raw = TRUE)6  281487.70  413876.11   0.680    0.501
## 
## Residual standard error: 30.34 on 33 degrees of freedom
## Multiple R-squared:  0.1902, Adjusted R-squared:  0.04292 
## F-statistic: 1.292 on 6 and 33 DF,  p-value: 0.2882
# Build the model
model <- lm(ScShot ~ poly(OwnGun, 2, raw = TRUE), data = train.data)
# Make predictions
predictions <- model %>% predict(test.data)
# Model performance
data.frame(
  RMSE = RMSE(predictions, test.data$ScShot),
  R2 = R2(predictions, test.data$ScShot)
)
##       RMSE         R2
## 1 37.06247 0.06894787
ggplot(train.data, aes(OwnGun, ScShot) ) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ poly(x, 2, raw = TRUE))

#Log transformations
# Build the model
model <- lm(ScShot ~ log(OwnGun), data = train.data)
# Make predictions
predictions <- model %>% predict(test.data)
# Model performance
data.frame(
  RMSE = RMSE(predictions, test.data$ScShot),
  R2 = R2(predictions, test.data$ScShot)
)
##       RMSE         R2
## 1 38.26886 0.01893637
ggplot(train.data, aes(OwnGun, ScShot) ) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ log(x))

#Spline Sets
knots <- quantile(train.data$OwnGun, p = c(0.25, 0.5, 0.75))
library(splines)
# Build the model
knots <- quantile(train.data$lstat, p = c(0.25, 0.5, 0.75))
model <- lm (ScShot ~ bs(OwnGun, knots = knots), data = train.data)
# Make predictions
predictions <- model %>% predict(test.data)
# Model performance
data.frame(
  RMSE = RMSE(predictions, test.data$ScShot),
  R2 = R2(predictions, test.data$ScShot)
)
##      RMSE         R2
## 1 38.3395 0.02127615
ggplot(train.data, aes(OwnGun, ScShot) ) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ splines::bs(x, df = 3))

#
library(mgcv)
# Build the model
model <- gam(ScShot ~ s(OwnGun), data = train.data)
# Make predictions
predictions <- model %>% predict(test.data)
# Model performance
data.frame(
  RMSE = RMSE(predictions, test.data$ScShot),
  R2 = R2(predictions, test.data$ScShot)
)
##       RMSE         R2
## 1 37.82311 0.03142867
ggplot(train.data, aes(OwnGun, ScShot) ) +
  geom_point() +
  stat_smooth(method = gam, formula = y ~ s(x))