Data 608 Final

This project analyzes educational investment in high-income countries alongside their crime index for the year 2019.

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(rvest)

## 
## Attaching package: 'rvest'

## The following object is masked from 'package:readr':
## 
##     guess_encoding

library(XML)
library(dplyr)
library(shiny)
library("wesanderson")

# Register the rsconnect deployment account.
# The token and secret are read from the RSCONNECT_TOKEN / RSCONNECT_SECRET
# environment variables (e.g. set in ~/.Renviron) so that credentials are never
# committed with the analysis. Any token/secret that was previously stored in
# this file should be revoked and regenerated.
rsconnect_token <- Sys.getenv("RSCONNECT_TOKEN")
rsconnect_secret <- Sys.getenv("RSCONNECT_SECRET")

if (nzchar(rsconnect_token) && nzchar(rsconnect_secret)) {
  rsconnect::setAccountInfo(
    name = "maliati",
    token = rsconnect_token,
    secret = rsconnect_secret
  )
} else {
  # Do not abort the analysis when only deployment credentials are missing.
  warning(
    "RSCONNECT_TOKEN / RSCONNECT_SECRET are not set; ",
    "skipping rsconnect account registration.",
    call. = FALSE
  )
}

Data Wrangling

# Download and parse the Numbeo crime-ranking page for 2019.
worldcrime <- read_html(
  "https://www.numbeo.com/crime/rankings_by_country.jsp?title=2019"
)

# Convert every HTML table on the page into an element of a list.
crimetable <- html_table(worldcrime, fill = TRUE)

# Keep the second table found on the page.
# NOTE(review): presumably this is the country ranking table - confirm against
# the live page, since the position depends on the page layout.
crimtab2 <- crimetable[[2L]]

# Country metadata file; later code reads its IncomeGroup and TableName columns.
dataIncomeGroup <- read.csv(
  file = "https://raw.githubusercontent.com/maliat-hossain/FileProcessing/main/Metadata_Country_API_NY.GDP.MKTP.CD_DS2_en_csv_v2_3930485.csv"
)

# Education spending file; its three columns are renamed further below.
Dataedu <- read.csv(
  file = "https://raw.githubusercontent.com/maliat-hossain/FileProcessing/main/World%20Education%20Spending%20Data.csv"
)

# Keep the 3rd and 5th metadata columns and expose TableName as the join key
# "Country".
IncomebyCountry <- dataIncomeGroup %>%
  select(c(3, 5)) %>%
  rename(Country = TableName)

# Keep the 2nd and 3rd columns of the scraped crime table.
CrimeIndex <- crimtab2 %>%
  select(c(2, 3))

# Inspect the raw education data before its columns are renamed.
view(Dataedu)

names(Dataedu) <- c("Country", "Recent Year", "GDP Spending Percentage")

# Combine the three sources, keeping only countries present in all of them.
IncomeEdu <- IncomebyCountry %>%
  inner_join(Dataedu, by = "Country")

IncomeEduCrime <- IncomeEdu %>%
  inner_join(CrimeIndex, by = "Country")

#colnames(IncomeEduCrime)[3] <- "Education Investment Percentage in 2019"

Filtering by High Income Countries

# Keep only the rows whose IncomeGroup is exactly "High income".
IncomeEduCrime1 <- IncomeEduCrime %>%
  filter(IncomeGroup == "High income")

# Give the fifth column (the crime index from the scraped table) a descriptive
# name.
names(IncomeEduCrime1)[5] <- "Crime Index of Year 2019"

Scatter Plot

library(ggplot2)
library(hrbrthemes)

## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.

##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and

##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow

# Scatter plot of education spending against the 2019 crime index, with a
# linear trend line and its confidence band.
# Columns are referenced by bare (backticked) name inside aes(): ggplot2
# discourages `data$column` there (it emits warnings and yields unreadable
# default axis labels).
p <- ggplot(
  IncomeEduCrime1,
  aes(x = `GDP Spending Percentage`, y = `Crime Index of Year 2019`)
) +
  geom_point() +
  # formula is spelled out to match the default geom_smooth() would pick and to
  # silence its informational message.
  geom_smooth(
    method = "lm",
    formula = y ~ x,
    color = "red",
    fill = "#69b3a2",
    se = TRUE
  ) +
  theme_ipsum()

## Warning: Use of `IncomeEduCrime1$`GDP Spending Percentage`` is discouraged. Use
## `GDP Spending Percentage` instead.

## Warning: Use of `IncomeEduCrime1$`Crime Index of Year 2019`` is discouraged. Use
## `Crime Index of Year 2019` instead.

## Warning: Use of `IncomeEduCrime1$`GDP Spending Percentage`` is discouraged. Use
## `GDP Spending Percentage` instead.

## Warning: Use of `IncomeEduCrime1$`Crime Index of Year 2019`` is discouraged. Use
## `Crime Index of Year 2019` instead.

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 2 rows containing missing values (geom_point).

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

Transforming Numeric Values to Categorical

# Drop every row that contains an NA in any column, then report the mean of the
# two numeric measures on the remaining rows.
IncomeEduCrime1 <- na.omit(IncomeEduCrime1)

mean(IncomeEduCrime1[["GDP Spending Percentage"]])

## [1] 5.029286

mean(IncomeEduCrime1[["Crime Index of Year 2019"]])

## [1] 33.31214

#IncomeEduCrime1 %>% add_row(IncomeGroup = "High income",Country = "United States", `Recent Year` = "2020",`GDP Spending Percentage`= "4.96", `Crime Index of Year 2019`= "47.13")
# Append the United States manually as one extra row.
# The values are supplied as a list() rather than c(): c() coerces all five
# values to character, which silently turns the numeric spending and crime
# columns into text and makes any later `>=` threshold comparison
# lexicographic instead of numeric.
# NOTE(review): 2020L assumes `Recent Year` is an integer (or character)
# column - confirm against the education CSV.
IncomeEduCrime1[nrow(IncomeEduCrime1) + 1, ] <- list(
  "High income", "United States", 2020L, 4.96, 47.13
)

#IncomeEduCrime4<-IncomeEduCrime1

library(arules)

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## 
## Attaching package: 'arules'

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following objects are masked from 'package:base':
## 
##     abbreviate, write

#IncomeEduCrime4$`Crime Index of Year 2019`<-arules::discretize(IncomeEduCrime4$`Crime Index of Year 2019`, breaks = 3, labels = c("Low Crime Rate","Medium Crime Rate","High Crime Rate"))
#IncomeEduCrime4$`GDP Spending Percentage`<-arules::discretize(IncomeEduCrime4$`GDP Spending Percentage`, breaks = 3, labels = c("Low Education Spending","Medium Education Spending","High Education Spending"))

# Work on a copy and derive two categorical flags from the numeric measures.
# The cut-offs 5.02 and 33.31 are fixed constants.
# NOTE(review): `>=` compares as text if these columns are character at this
# point - confirm both are numeric.
DataRede <- IncomeEduCrime1

DataRede[["GDP Spending Percentage Type"]] <- factor(
  ifelse(
    DataRede[["GDP Spending Percentage"]] >= 5.02,
    "High Spending",
    "Low Spending"
  )
)

DataRede[["Crime Index of Year 2019 Type"]] <- factor(
  ifelse(
    DataRede[["Crime Index of Year 2019"]] >= 33.31,
    "High Crime Rate",
    "Low Crime Rate"
  )
)

#DataRede$`Crime Index of Year 2019`<-arules::discretize(DataRede$`Crime Index of Year 2019`, breaks = 2, labels = c("Low Crime Rate","Medium Crime Rate","High Crime Rate"))
#IncomeEduCrime4$`GDP Spending Percentage`<-arules::discretize(IncomeEduCrime4$`GDP Spending Percentage`, breaks = 3, labels = c("Low Education Spending","Medium Education Spending","High Education Spending"))

Data Visualization:

# Horizontal bar chart: one bar per country, positioned by its crime-rate
# category and coloured by its education-spending category.
# Columns are referenced by bare (backticked) name inside aes() instead of
# `DataRede$...`, which ggplot2 discourages and which produces unreadable
# default axis and legend titles.
ggplot(
  DataRede,
  aes(
    x = Country,
    y = `Crime Index of Year 2019 Type`,
    fill = `GDP Spending Percentage Type`
  )
) +
  # geom_col() is the identity-stat form of geom_bar(stat = "identity").
  geom_col(position = "dodge") +
  # Stagger the country labels over two rows so they do not overlap.
  scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
  scale_fill_manual(values = wes_palette("GrandBudapest2")) +
  coord_flip()

Data 608 Final

Maliat I

4/23/2022

This project analyzes educational investment in high-income countries alongside their crime index for the year 2019.

Data Wrangling

Filtering by High Income Countries

Scatter Plot

Transforming Numeric Values to Categorical

Data Visualization: