# Run this analysis in a fresh R session (restart R, or knit the document)
# rather than calling rm(list = ls()): that call deletes the caller's workspace
# when the file is sourced, yet it does not detach packages or reset options,
# so it does not actually give a clean session. Every object used below is
# created by this script before it is read.
#install.packages("ggplot2")
#install.packages("dplyr")
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the raw GDP and crime tables from CSV files in the working directory.
df.gdp <- read.csv(file = "GDP_raw_data_030522.csv")
df.crime <- read.csv(file = "Crime_raw_data_031122.csv")

# Keep only the GDP-per-capita series.
gdp_hf <- filter(df.gdp, series == "GDP per capita (US dollars)")

# Histogram of GDP per capita over every country in the table, for 2005.
x2005 <- gdp_hf$value[gdp_hf$year == "2005"]
hist(x2005, breaks = 20, main = "Distribution of GDP per Capita in 2005")

# Same histogram for 2018.
x2018 <- gdp_hf$value[gdp_hf$year == "2018"]
hist(x2018, breaks = 20, main = "Distribution of GDP per Capita in 2018")
### Most countries are below $50K per capita. The distribution is not normal, but that is plausible given the number of underdeveloped/poor nations.
# GDP per capita over time for the 24 countries used in the homicide analysis.
gdp_filt <- filter(df.gdp, series == "GDP per capita (US dollars)")
#gdp_filt <- df.gdp %>% filter(year %in% c("2005","2010","2018")) commented out bc lines in graph dont group w these selected years, and not needed
gdp_nc <- filter(
  gdp_filt,
  country %in% c(
    "Argentina", "Brazil", "Canada", "China", "Colombia", "El Salvador",
    "France", "Germany", "Greece", "Guatemala", "India", "Iceland", "Italy",
    "Japan", "Kenya", "Mexico", "Netherlands", "Norway", "Philippines",
    "Russian Federation", "Singapore", "South Africa", "Spain",
    "United States of America"
  )
)
# Make sure the plotted value is numeric.
gdp_nc$value <- as.numeric(gdp_nc$value)

# One coloured line (with points) per country.
ggplot(gdp_nc, aes(x = year, y = value, colour = country)) +
  geom_line() +
  geom_point() +
  labs(
    title = "GDP per Capita in US Dollars",
    x = "Year",
    y = "GDP per Capita"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
# Intentional homicide rates for the selected countries in 2005, 2010 and 2018.
inthom_filt <- df.crime %>%
  filter(crime == "Intentional homicide rates per 100,000")
inthom_filt_2 <- inthom_filt %>%
  filter(year %in% c("2005", "2010", "2018"))
inthom_nc <- inthom_filt_2 %>%
  filter(country %in% c(
    "Argentina", "Brazil", "Canada", "China", "Colombia", "El Salvador",
    "France", "Germany", "Greece", "Guatemala", "India", "Iceland", "Italy",
    "Japan", "Kenya", "Mexico", "Netherlands", "Norway", "Philippines",
    "Russian Federation", "Singapore", "South Africa", "Spain",
    "United States of America"
  ))
inthom_nc$value <- as.numeric(inthom_nc$value)

# One line per country. The group aesthetic is set explicitly so the three
# observations of a country are connected; x, y and colour are inherited from
# the top-level aes(), and ggplot ignores dplyr grouping, so neither is repeated.
ggplot(inthom_nc, aes(year, value, color = country)) +
  geom_line(aes(group = country)) +
  geom_point(size = .5) +
  # The label is mapped from the plotted data rather than passed as a separate
  # vector, so it cannot drift out of step with the rows being drawn.
  geom_text(
    aes(label = country),
    size = 2,
    nudge_x = .2, #to add labels, I don't really like them though...
    nudge_y = .1,
    check_overlap = TRUE
  ) +
  ggtitle("Number of Intentional Homicides per 100,000 People") +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlab("Year") + ylab("# of Intentional Homicides")
The intentional homicide data are more varied than the GDP data: some countries show a positive trend, others show no change, and others show a negative trend.
It is important to note that the years are not evenly spaced, so the data missing between 2010 and 2018 may alter the graph.
El Salvador, Guatemala, and Colombia (all Latin American countries) see a decrease from 2010 to 2018; Mexico and Brazil do not.
# 2005 cross-section: regress homicide rate on GDP per capita.
gdp_2005 <- filter(gdp_nc, year %in% "2005")
inthom2005 <- filter(inthom_nc, year %in% "2005")

# The two value columns are paired by row position.
# NOTE(review): assumes both extracts list the same countries in the same order.
ndf <- data.frame(
  GDP_per_capita1 = gdp_2005$value,
  value2005 = inthom2005$value
)
model <- lm(formula = value2005 ~ GDP_per_capita1, data = ndf)
plot(ndf)
summary(model)
##
## Call:
## lm(formula = value2005 ~ GDP_per_capita1, data = ndf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.158 -8.757 -3.810 5.786 43.672
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21.5564661 4.5802028 4.706 0.000107 ***
## GDP_per_capita1 -0.0004647 0.0001594 -2.916 0.008014 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.31 on 22 degrees of freedom
## Multiple R-squared: 0.2787, Adjusted R-squared: 0.2459
## F-statistic: 8.5 on 1 and 22 DF, p-value: 0.008014
# Scatter plot of the 2005 data with the fitted least-squares line in red.
ggplot(data = ndf, mapping = aes(x = GDP_per_capita1, y = value2005)) +
  geom_point() +
  stat_smooth(method = "lm", colour = "red") +
  labs(
    title = "Correlation between GDP and Homicide Rates in 2005",
    x = "GDP per Capita in (US$)",
    y = "# of Intentional Homicides per 100,000 People"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
## `geom_smooth()` using formula 'y ~ x'
No clear trend: just two loose groups of points in the vague shape of a log function. The linear model's line of best fit shows that as GDP per capita increases, the intentional homicide rate decreases slightly. The correlation is very weak, though, and the error bands are quite large, so the fit is not really reliable.
# 2005 again, this time regressing log(homicide rate) on log(GDP per capita).
gdp_2005 <- filter(gdp_nc, year %in% "2005")
inthom2005 <- filter(inthom_nc, year %in% "2005")

# Both columns hold natural logs of the raw values, paired by row position.
ndf <- data.frame(
  GDP_per_capita1 = log(gdp_2005$value),
  value2005 = log(inthom2005$value)
)
model <- lm(formula = value2005 ~ GDP_per_capita1, data = ndf)
plot(ndf)
summary(model)
##
## Call:
## lm(formula = value2005 ~ GDP_per_capita1, data = ndf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0266 -0.6513 -0.2803 1.3509 1.9338
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.1584 1.6325 4.385 0.000236 ***
## GDP_per_capita1 -0.6327 0.1755 -3.605 0.001573 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.26 on 22 degrees of freedom
## Multiple R-squared: 0.3713, Adjusted R-squared: 0.3428
## F-statistic: 12.99 on 1 and 22 DF, p-value: 0.001573
# Scatter plot of the log-log 2005 data with the fitted least-squares line.
# At this point ndf holds log(GDP per capita) and log(homicide rate), so the
# title and axis labels say so; the previous labels described the raw scales.
ggplot(ndf, aes(x = GDP_per_capita1, y = value2005)) +
  geom_point() +
  stat_smooth(method = "lm", col = "red") +
  ggtitle("Correlation between log GDP and log Homicide Rates in 2005") +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlab("log(GDP per Capita in US$)") +
  ylab("log(# of Intentional Homicides per 100,000 People)")
## `geom_smooth()` using formula 'y ~ x'
# 2010 cross-section: regress homicide rate on GDP per capita.
gdp_2010 <- filter(gdp_nc, year %in% "2010")
inthom2010 <- filter(inthom_nc, year %in% "2010")

# The two value columns are paired by row position.
# NOTE(review): assumes both extracts list the same countries in the same order.
ndf <- data.frame(
  GDP_per_capita2 = gdp_2010$value,
  value2010 = inthom2010$value
)
model <- lm(formula = value2010 ~ GDP_per_capita2, data = ndf)
plot(ndf)
summary(model)
##
## Call:
## lm(formula = value2010 ~ GDP_per_capita2, data = ndf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.357 -8.480 -3.514 5.200 44.570
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21.0960333 4.4185154 4.774 9.11e-05 ***
## GDP_per_capita2 -0.0003910 0.0001298 -3.011 0.00643 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.26 on 22 degrees of freedom
## Multiple R-squared: 0.2919, Adjusted R-squared: 0.2597
## F-statistic: 9.068 on 1 and 22 DF, p-value: 0.006425
# Scatter plot of the 2010 data with the fitted least-squares line in red.
ggplot(data = ndf, mapping = aes(x = GDP_per_capita2, y = value2010)) +
  geom_point() +
  stat_smooth(method = "lm", colour = "red") +
  labs(
    title = "Correlation between GDP and Homicide Rates in 2010",
    x = "GDP per Capita in (US$)",
    y = "# of Intentional Homicides per 100,000 People"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
## `geom_smooth()` using formula 'y ~ x'
# 2018 cross-section: regress homicide rate on GDP per capita.
# This section previously rebuilt ndf from the 2010 extracts, so the model and
# the plot simply repeated the 2010 results. It now uses the 2018 extracts.
gdp_2018 <- gdp_nc %>% filter(year %in% c("2018"))
inthom2018 <- inthom_nc %>% filter(year %in% c("2018"))

# Pair the two series by country rather than by row position, so a country
# missing from either 2018 extract is dropped instead of shifting the pairing
# or making data.frame() fail on unequal lengths.
ndf <- inner_join(
  gdp_2018 %>% dplyr::select(country, GDP_per_capita3 = value),
  inthom2018 %>% dplyr::select(country, value2018 = value),
  by = "country"
) %>%
  dplyr::select(GDP_per_capita3, value2018)
model <- lm(value2018 ~ GDP_per_capita3, ndf)
plot(ndf)
summary(model)
## NOTE: the console output previously recorded here was a copy of the 2010
## model (value2010 ~ GDP_per_capita2). Re-run / re-knit to record the 2018
## results.
ggplot(ndf, aes(x = GDP_per_capita3, y = value2018)) +
  geom_point() +
  stat_smooth(method = "lm", col = "red") +
  ggtitle("Correlation between GDP and Homicide Rates in 2018") +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlab("GDP per Capita in (US$)") +
  ylab("# of Intentional Homicides per 100,000 People")
## `geom_smooth()` using formula 'y ~ x'
# GDP per capita over time for the 24 countries used in the theft analysis.
gdp_filt2 <- filter(df.gdp, series == "GDP per capita (US dollars)")
gdp_cnt <- filter(
  gdp_filt2,
  country %in% c(
    "Albania", "Canada", "Colombia", "El Salvador", "France", "Germany",
    "Greece", "Guatemala", "India", "Iceland", "Italy", "Japan", "Kenya",
    "Mexico", "Netherlands", "Nigeria", "Norway", "Peru", "Philippines",
    "Russian Federation", "Singapore", "Turkey", "Ukraine",
    "United States of America"
  )
)
# Make sure the plotted value is numeric.
gdp_cnt$value <- as.numeric(gdp_cnt$value)

# One coloured line (with points) per country.
ggplot(gdp_cnt, aes(x = year, y = value, colour = country)) +
  geom_line() +
  geom_point() +
  labs(
    title = "GDP per Capita in US Dollars",
    x = "Year",
    y = "GDP per Capita"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
# Theft rates for the theft-analysis countries in 2005, 2010, 2017 and 2018.
theft_filt <- filter(
  df.crime,
  crime == "Theft at the national level, rate per 100,000 population"
)
theft_filt_2 <- filter(theft_filt, year %in% c("2005", "2010", "2017", "2018"))
theft_nc <- filter(
  theft_filt_2,
  country %in% c(
    "Albania", "Canada", "Colombia", "El Salvador", "France", "Germany",
    "Greece", "Guatemala", "India", "Iceland", "Italy", "Japan", "Kenya",
    "Mexico", "Netherlands", "Nigeria", "Norway", "Peru", "Philippines",
    "Russian Federation", "Singapore", "Turkey", "Ukraine",
    "United States of America"
  )
)
theft_nc$value <- as.numeric(theft_nc$value)

# One line per country; the group aesthetic connects each country's points.
ggplot(theft_nc, aes(x = year, y = value, colour = country)) +
  geom_line(aes(group = country)) +
  geom_point(size = 1) +
  labs(
    title = "Number of Thefts per 100,000 People",
    x = "Year",
    y = "# of Thefts"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
Overall, there is a negative trend in the data for all countries from 2005 to 2018.
# 2010 cross-section for the theft countries: regress theft rate on GDP per capita.
gdp2010 <- filter(gdp_cnt, year %in% "2010")
theft2010 <- filter(theft_nc, year %in% "2010")

# The two value columns are paired by row position.
# NOTE(review): assumes both extracts list the same countries in the same order.
tdf <- data.frame(
  GDP_per_capita4 = gdp2010$value,
  value2010 = theft2010$value
)
model <- lm(formula = value2010 ~ GDP_per_capita4, data = tdf)
plot(tdf)
summary(model)
##
## Call:
## lm(formula = value2010 ~ GDP_per_capita4, data = tdf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1301.90 -116.98 -61.88 143.69 2174.00
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.283e+01 1.839e+02 0.342 0.736
## GDP_per_capita4 3.413e-02 5.508e-03 6.197 3.09e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 626.3 on 22 degrees of freedom
## Multiple R-squared: 0.6358, Adjusted R-squared: 0.6192
## F-statistic: 38.4 on 1 and 22 DF, p-value: 3.086e-06
# Scatter plot of 2010 theft rate against GDP per capita, with the fitted line.
ggplot(data = tdf, mapping = aes(x = GDP_per_capita4, y = value2010)) +
  geom_point() +
  stat_smooth(method = "lm", colour = "red") +
  labs(
    title = "Correlation between GDP and Theft in 2010",
    x = "GDP per Capita in (US$)",
    y = "# of Thefts per 100,000 People"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
## `geom_smooth()` using formula 'y ~ x'
Weak positive correlation. The number of data points could be increased to reduce the error, but because two different data sets are combined to create the linear model, it is very difficult to effectively filter for the countries that have data available for each variable and year.
#install.packages("ClusterR")
library(ClusterR)
## Warning: package 'ClusterR' was built under R version 4.1.3
## Loading required package: gtools
## Warning: package 'gtools' was built under R version 4.1.3
library(cluster)
## Warning: package 'cluster' was built under R version 4.1.3
# Two-feature table for 2005 (GDP per capita and homicide rate), built from the
# first 24 values of each extract and paired by row position.
df_clas2005 <- data.frame(
  gdpk2005 = gdp_2005$value[seq_len(24)],
  homsk2005 = inthom2005$value[seq_len(24)]
)

# Fixed seed so the random initial centres, and hence the clusters, are
# reproducible. Row 17 is left out of the clustering.
set.seed(10032001)
kmeans.re <- kmeans(x = df_clas2005[-17, ], centers = 3)
kmeans.re
## K-means clustering with 3 clusters of sizes 2, 9, 12
##
## Cluster means:
## gdpk2005 homsk2005
## 1 61838.00 0.850000
## 2 33056.67 1.655556
## 3 3423.75 22.116667
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 18 19 20 21 22 23 24
## 3 3 2 3 3 3 2 2 2 3 1 3 2 2 3 3 1 3 3 2 3 2 2
##
## Within cluster sum of squares by cluster:
## [1] 46889928 336690834 62213224
## (between_SS / total_SS = 94.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Plot the clustered 2005 observations, coloured by cluster, then overlay the
# cluster centres as filled squares.
# Row 17 was excluded before clustering, so kmeans.re$cluster has one entry per
# remaining row. The same row is dropped here so each point gets its own
# cluster colour; plotting all 24 rows would recycle the 23 colours and shift
# them from row 17 onward.
plot(
  df_clas2005[-17, c("gdpk2005", "homsk2005")],
  col = kmeans.re$cluster,
  main = "2005 Clusters with means as centroids"
)
points(
  kmeans.re$centers[, c("gdpk2005", "homsk2005")],
  col = 1:3,
  bg = 1:3,
  pch = 22,
  cex = 1
)
# First version of the 2010 feature table: GDP comes from gdp2010 (the extract
# filtered from gdp_cnt), homicides from inthom2010 and thefts from theft2010.
# The columns are combined by row position, using the first 24 values of each.
# This table is overwritten by the second definition below; only its pairs
# plot is kept.
df_clas2010 <- data.frame(
#country = gdp_2010$country[1:24],
gdp_per_capita = gdp2010$value[1:24],
homicides = inthom2010$value[1:24],
thefts = theft2010$value[1:24]
)
plot(df_clas2010)
# Second version, used by the clustering that follows: GDP now comes from
# gdp_2010 (the extract filtered from gdp_nc).
# NOTE(review): gdp_2010 and inthom2010 are filtered with one country list, and
# theft2010 with a different one (e.g. Argentina appears only in the former,
# Albania only in the latter). Because the columns are combined by position, a
# row may mix values from different countries -- confirm, and consider joining
# the three extracts on country instead.
df_clas2010 <- data.frame(
#country = gdp_2010$country[1:24],
gdp_per_capita = gdp_2010$value[1:24],
homicides = inthom2010$value[1:24],
thefts = theft2010$value[1:24]
)
plot(df_clas2010)
# Re-seed so the 4-cluster solution on the 2010 table is reproducible; row 17
# is left out of the clustering, as in the 2005 run.
set.seed(seed = 10032001)
kmeans.re <- kmeans(x = df_clas2010[-17, ], centers = 4)
kmeans.re
## K-means clustering with 4 clusters of sizes 1, 6, 6, 10
##
## Cluster means:
## gdp_per_capita homicides thefts
## 1 87754.000 0.60000 192.30
## 2 9230.667 21.21667 771.20
## 3 2440.000 20.63333 1036.65
## 4 40588.900 1.35000 752.79
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 18 19 20 21 22 23 24
## 2 2 4 3 2 3 4 4 4 3 4 3 4 4 3 2 1 3 2 4 2 4 4
##
## Within cluster sum of squares by cluster:
## [1] 0 23032221 19782335 477399447
## (between_SS / total_SS = 95.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Homicides vs thefts for the clustered 2010 observations, coloured by cluster,
# with the cluster centres overlaid as filled circles.
# Row 17 was excluded before clustering, so it is excluded here too; otherwise
# the 23 cluster colours would be recycled over 24 points and shifted from
# row 17 onward.
plot(
  df_clas2010[-17, c("homicides", "thefts")],
  col = kmeans.re$cluster,
  main = "K Clusters with means as centroids"
)
points(
  kmeans.re$centers[, c("homicides", "thefts")],
  col = 1:4,
  bg = 1:4,
  pch = 21,
  cex = 1
)
# GDP per capita vs thefts for the clustered 2010 observations, coloured by
# cluster, with the cluster centres overlaid as filled circles.
# Row 17 was excluded before clustering, so it is excluded here too; otherwise
# the 23 cluster colours would be recycled over 24 points and shifted from
# row 17 onward.
plot(
  df_clas2010[-17, c("gdp_per_capita", "thefts")],
  col = kmeans.re$cluster,
  main = "K Clusters with means as centroids"
)
points(
  kmeans.re$centers[, c("gdp_per_capita", "thefts")],
  col = 1:4,
  bg = 1:4,
  pch = 21,
  cex = 1
)
It makes sense that the algorithm finds a clearer clustering pattern for GDP vs. theft, since that pair of variables has the strongest correlation.
# GDP per capita vs homicides for the clustered 2010 observations, coloured by
# cluster, with the cluster centres overlaid as filled circles.
# Row 17 was excluded before clustering, so it is excluded here too; otherwise
# the 23 cluster colours would be recycled over 24 points and shifted from
# row 17 onward.
plot(
  df_clas2010[-17, c("gdp_per_capita", "homicides")],
  col = kmeans.re$cluster,
  main = "K Clusters with means as centroids"
)
points(
  kmeans.re$centers[, c("gdp_per_capita", "homicides")],
  col = 1:4,
  bg = 1:4,
  pch = 21,
  cex = 1
)
K-means is an unsupervised algorithm, so there are no initial classes in the original dataset and thus nothing to test the clusters against. On inspection, the clustering gives overlapping clusters. I assume that because there is weak-to-no correlation between homicides vs. GDP and homicides vs. thefts, those clusters are less well defined (the theft vs. GDP clusters are better defined and overlap less).
Although a lot was learned, I feel the data set really limited what was possible code-wise: missing data; inconsistency within one dataset (e.g. a country would have theft stats from 2005 and 2018 but not from 2010, or a country would have homicide or assault data from 2010 but not theft data, etc.); the two data sets could not be bound together; mismatched dimensions; and limited variables due to missing data.
With more in-depth work, I could manipulate other variables in the original dataset to have enough variables; combining them with the HDI set (for classes) could have produced a good decision tree, but there are thousands of observations in both datasets and many missing values.
The positive correlation between theft and GDP was the most shocking to me. Among the approximately 25 countries I analysed, theft increased as GDP per capita increased.