rm(list=ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 494409 26.5 940480 50.3 750400 40.1
## Vcells 934606 7.2 1650153 12.6 1176241 9.0
library(tidyverse) #visualizeing, transforming, inputing, tidying and joining data
## ── Attaching packages ──────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.1 ✔ dplyr 0.7.4
## ✔ tidyr 0.7.2 ✔ stringr 1.2.0
## ✔ readr 1.1.1 ✔ forcats 0.2.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(readxl)
library(plotly) #showing plots and data at the same time
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggmap)
##
## Attaching package: 'ggmap'
## The following object is masked from 'package:plotly':
##
## wind
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
library(mapdata)
library(DT) #previewing the data sets
library(ggrepel) #avoid label overlapping
library(car) #testing multicollinearity and heteroscedasticity
## Warning: package 'car' was built under R version 3.4.4
## Loading required package: carData
## Warning: package 'carData' was built under R version 3.4.4
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library(modelr) #testing if the model can be applied generally
library(scales) #formatting scales to non-scientific type
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(gridExtra) #putting multiple plots into 1 page
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(neuralnet) #training of Neural Networks
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
##
## compute
wages<- read.csv(file="mwe_2016complete.csv", header=TRUE, sep=",")
head(wages, 20)
## Series.ID
## 1 WMU00000001020000001100000006
## 2 WMU00000001020000001100000007
## 3 WMU00000001020000001100000008
## 4 WMU00000001020000001100000009
## 5 WMU00000001020000001100000010
## 6 WMU00000001020000001100000011
## 7 WMU00000001020000001100000012
## 8 WMU00000001020000001100000013
## 9 WMU00000001020000001100000016
## 10 WMU00000001020000001110210008
## 11 WMU00000001020000001110210010
## 12 WMU00000001020000001110210011
## 13 WMU00000001020000001110210012
## 14 WMU00000001020000001110210013
## 15 WMU00000001020000001110210016
## 16 WMU00000001020000001120210009
## 17 WMU00000001020000001120210010
## 18 WMU00000001020000001120210011
## 19 WMU00000001020000001120210012
## 20 WMU00000001020000001120210016
## Series.Title
## 1 Hourly mean wage for management occupations, for all US, Level 06
## 2 Hourly mean wage for management occupations, for all US, Level 07
## 3 Hourly mean wage for management occupations, for all US, Level 08
## 4 Hourly mean wage for management occupations, for all US, Level 09
## 5 Hourly mean wage for management occupations, for all US, Level 10
## 6 Hourly mean wage for management occupations, for all US, Level 11
## 7 Hourly mean wage for management occupations, for all US, Level 12
## 8 Hourly mean wage for management occupations, for all US, Level 13
## 9 Hourly mean wage for management occupations, for all US, not able to level
## 10 Hourly mean wage for general and operations managers, for all US, Level 08
## 11 Hourly mean wage for general and operations managers, for all US, Level 10
## 12 Hourly mean wage for general and operations managers, for all US, Level 11
## 13 Hourly mean wage for general and operations managers, for all US, Level 12
## 14 Hourly mean wage for general and operations managers, for all US, Level 13
## 15 Hourly mean wage for general and operations managers, for all US, not able to level
## 16 Hourly mean wage for marketing managers, for all US, Level 09
## 17 Hourly mean wage for marketing managers, for all US, Level 10
## 18 Hourly mean wage for marketing managers, for all US, Level 11
## 19 Hourly mean wage for marketing managers, for all US, Level 12
## 20 Hourly mean wage for marketing managers, for all US, not able to level
## Average.Hourly.Wage Area.Code Area.Level Area.Text Occupation.Code
## 1 16.02 0 National National 110000
## 2 22.65 0 National National 110000
## 3 25.86 0 National National 110000
## 4 34.61 0 National National 110000
## 5 40.20 0 National National 110000
## 6 52.91 0 National National 110000
## 7 68.79 0 National National 110000
## 8 85.22 0 National National 110000
## 9 65.23 0 National National 110000
## 10 24.10 0 National National 111021
## 11 36.27 0 National National 111021
## 12 55.80 0 National National 111021
## 13 61.58 0 National National 111021
## 14 77.03 0 National National 111021
## 15 70.49 0 National National 111021
## 16 40.27 0 National National 112021
## 17 43.09 0 National National 112021
## 18 56.17 0 National National 112021
## 19 73.05 0 National National 112021
## 20 76.92 0 National National 112021
## Occupation.Text Job.Characteristic.Code
## 1 Management Occupations 0
## 2 Management Occupations 0
## 3 Management Occupations 0
## 4 Management Occupations 0
## 5 Management Occupations 0
## 6 Management Occupations 0
## 7 Management Occupations 0
## 8 Management Occupations 0
## 9 Management Occupations 0
## 10 General and Operations Managers 0
## 11 General and Operations Managers 0
## 12 General and Operations Managers 0
## 13 General and Operations Managers 0
## 14 General and Operations Managers 0
## 15 General and Operations Managers 0
## 16 Marketing Managers 0
## 17 Marketing Managers 0
## 18 Marketing Managers 0
## 19 Marketing Managers 0
## 20 Marketing Managers 0
## Job.Characteristic.Text Work.Level.Code Work.Level.Text
## 1 All workers 6 Level 06
## 2 All workers 7 Level 07
## 3 All workers 8 Level 08
## 4 All workers 9 Level 09
## 5 All workers 10 Level 10
## 6 All workers 11 Level 11
## 7 All workers 12 Level 12
## 8 All workers 13 Level 13
## 9 All workers 16 Not able to be leveled
## 10 All workers 8 Level 08
## 11 All workers 10 Level 10
## 12 All workers 11 Level 11
## 13 All workers 12 Level 12
## 14 All workers 13 Level 13
## 15 All workers 16 Not able to be leveled
## 16 All workers 9 Level 09
## 17 All workers 10 Level 10
## 18 All workers 11 Level 11
## 19 All workers 12 Level 12
## 20 All workers 16 Not able to be leveled
summary(wages)
## Series.ID
## WMU00000001020000001100000006: 1
## WMU00000001020000001100000007: 1
## WMU00000001020000001100000008: 1
## WMU00000001020000001100000009: 1
## WMU00000001020000001100000010: 1
## WMU00000001020000001100000011: 1
## (Other) :255473
## Series.Title
## Hourly mean wage for accountants and auditors, for all US, Level 06: 1
## Hourly mean wage for accountants and auditors, for all US, Level 07: 1
## Hourly mean wage for accountants and auditors, for all US, Level 08: 1
## Hourly mean wage for accountants and auditors, for all US, Level 09: 1
## Hourly mean wage for accountants and auditors, for all US, Level 10: 1
## Hourly mean wage for accountants and auditors, for all US, Level 11: 1
## (Other) :255473
## Average.Hourly.Wage Area.Code Area.Level
## Min. : 8.26 Min. : 0 Metro area :165981
## 1st Qu.:12.31 1st Qu.: 24860 National : 3386
## Median :16.84 Median : 41100 Nonmetro area: 64329
## Mean :20.04 Mean :1054035 State area : 21783
## 3rd Qu.:24.31 3rd Qu.:1800003
## Max. :99.88 Max. :5600004
##
## Area.Text Occupation.Code
## National : 3386 Min. :110000
## Ohio : 804 1st Qu.:310000
## Columbus, OH : 775 Median :410000
## Cleveland-Elyria, OH : 769 Mean :377382
## Indianapolis-Carmel-Anderson, IN: 764 3rd Qu.:436014
## North Carolina : 762 Max. :537199
## (Other) :248219
## Occupation.Text
## Office and Administrative Support Occupations : 16856
## Sales and Related Occupations : 13860
## Healthcare Practitioners and Technical Occupations: 13274
## Transportation and Material Moving Occupations : 12613
## Production Occupations : 12513
## Education, Training, and Library Occupations : 9912
## (Other) :176451
## Job.Characteristic.Code Job.Characteristic.Text Work.Level.Code
## Min. : 0.00 All workers :107059 Min. : 0.00
## 1st Qu.: 0.00 Full-time : 74120 1st Qu.: 1.00
## Median :24.00 Incentive-based pay: 4329 Median : 3.00
## Mean :14.62 Nonunion : 12684 Mean : 4.83
## 3rd Qu.:25.00 Part-time : 40274 3rd Qu.: 7.00
## Max. :28.00 Time-based pay : 4329 Max. :16.00
## Union : 12684
## Work.Level.Text
## All levels :63658
## Not able to be leveled:27933
## Level 03 :27287
## Level 02 :24440
## Level 04 :23514
## Level 05 :17002
## (Other) :71645
#The histogram of the data shows that the distrubution on the data points of the Average Hourly Wage is skewed to the right and distributed unevenly.
hist(wages$Average.Hourly.Wage, col="green")
#looking at the min and max values of the data
apply(wages,2 , range)
## Series.ID
## [1,] "WMU00000001020000001100000006"
## [2,] "WMU56000041020000005370622601"
## Series.Title
## [1,] "Hourly mean wage for accountants and auditors, for all US, Level 06"
## [2,] "Hourly mean wage for word processors and typists, union, for all US"
## Average.Hourly.Wage Area.Code Area.Level Area.Text
## [1,] " 8.26" " 0" "Metro area" "Abilene, TX"
## [2,] "99.88" "5600004" "State area" "Yuma, AZ"
## Occupation.Code Occupation.Text Job.Characteristic.Code
## [1,] "110000" "Accountants and Auditors" " 0"
## [2,] "537199" "Word Processors and Typists" "28"
## Job.Characteristic.Text Work.Level.Code Work.Level.Text
## [1,] "All workers" " 0" "All levels"
## [2,] "Union" "16" "Not able to be leveled"
library(jsonlite)
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
##
## flatten
library(tidyr)
library(dplyr)
library(knitr)
api_key<-"iiHnOKfno2Mgkt5AynpvPpUQTEyxE77jo1RU8PIv"
url <-"https://api.usa.gov/crime/fbi/ucr/estimates/national?page=1&per_page=100&output=json&api_key="
api_url= paste(url,api_key,sep="")
api_url
## [1] "https://api.usa.gov/crime/fbi/ucr/estimates/national?page=1&per_page=100&output=json&api_key=iiHnOKfno2Mgkt5AynpvPpUQTEyxE77jo1RU8PIv"
FBI Crime Data API
#Read JSON using API key
crimes<- fromJSON(api_url, flatten = TRUE)
#Convert JSON into data.frame
crimes<- data.frame(crimes)
#review the column names
colnames(crimes)
## [1] "results.rape_revised" "results.motor_vehicle_theft"
## [3] "results.violent_crime" "results.larceny"
## [5] "results.rape_legacy" "results.homicide"
## [7] "results.population" "results.property_crime"
## [9] "results.caveats" "results.burglary"
## [11] "results.aggravated_assault" "results.robbery"
## [13] "results.year" "pagination.pages"
## [15] "pagination.count" "pagination.page"
## [17] "pagination.per_page"
apply(crimes, 2, range)
## results.rape_revised results.motor_vehicle_theft
## [1,] NA " 686803"
## [2,] NA "1472441"
## results.violent_crime results.larceny results.rape_legacy
## [1,] "1153022" "5638455" " 82109"
## [2,] "1798792" "7997710" "130603"
## results.homicide results.population results.property_crime
## [1,] "14164" "262803276" " 7919035"
## [2,] "21606" "323127513" "12063935"
## results.caveats results.burglary results.aggravated_assault
## [1,] NA "1515096" " 726777"
## [2,] NA "2593784" "1099207"
## results.robbery results.year pagination.pages pagination.count
## [1,] "322905" "1995" "1" "22"
## [2,] "580509" "2016" "1" "22"
## pagination.page pagination.per_page
## [1,] "1" "100"
## [2,] "1" "100"
head(crimes)
## results.rape_revised results.motor_vehicle_theft results.violent_crime
## 1 NA 1472441 1798792
## 2 NA 1394238 1688540
## 3 NA 1354189 1636099
## 4 NA 1242781 1533887
## 5 NA 1152075 1426044
## 6 NA 1160002 1425486
## results.larceny results.rape_legacy results.homicide results.population
## 1 7997710 97470 21606 262803276
## 2 7904685 96252 19645 265228572
## 3 7743760 96153 18211 267783607
## 4 7376311 93144 16974 270248003
## 5 6955520 89411 15522 272690813
## 6 6971590 90178 15586 281421906
## results.property_crime results.caveats results.burglary
## 1 12063935 <NA> 2593784
## 2 11805323 <NA> 2506400
## 3 11558975 <NA> 2461026
## 4 10951827 <NA> 2332735
## 5 10208334 <NA> 2100739
## 6 10182584 <NA> 2050992
## results.aggravated_assault results.robbery results.year pagination.pages
## 1 1099207 580509 1995 1
## 2 1037049 535594 1996 1
## 3 1023201 498534 1997 1
## 4 976583 447186 1998 1
## 5 911740 409371 1999 1
## 6 911706 408016 2000 1
## pagination.count pagination.page pagination.per_page
## 1 22 1 100
## 2 22 1 100
## 3 22 1 100
## 4 22 1 100
## 5 22 1 100
## 6 22 1 100
crimes_clean <- crimes[, -c(1,9,14,15,16,17)]
head(crimes_clean)
## results.motor_vehicle_theft results.violent_crime results.larceny
## 1 1472441 1798792 7997710
## 2 1394238 1688540 7904685
## 3 1354189 1636099 7743760
## 4 1242781 1533887 7376311
## 5 1152075 1426044 6955520
## 6 1160002 1425486 6971590
## results.rape_legacy results.homicide results.population
## 1 97470 21606 262803276
## 2 96252 19645 265228572
## 3 96153 18211 267783607
## 4 93144 16974 270248003
## 5 89411 15522 272690813
## 6 90178 15586 281421906
## results.property_crime results.burglary results.aggravated_assault
## 1 12063935 2593784 1099207
## 2 11805323 2506400 1037049
## 3 11558975 2461026 1023201
## 4 10951827 2332735 976583
## 5 10208334 2100739 911740
## 6 10182584 2050992 911706
## results.robbery results.year
## 1 580509 1995
## 2 535594 1996
## 3 498534 1997
## 4 447186 1998
## 5 409371 1999
## 6 408016 2000
head(crimes_clean)
## results.motor_vehicle_theft results.violent_crime results.larceny
## 1 1472441 1798792 7997710
## 2 1394238 1688540 7904685
## 3 1354189 1636099 7743760
## 4 1242781 1533887 7376311
## 5 1152075 1426044 6955520
## 6 1160002 1425486 6971590
## results.rape_legacy results.homicide results.population
## 1 97470 21606 262803276
## 2 96252 19645 265228572
## 3 96153 18211 267783607
## 4 93144 16974 270248003
## 5 89411 15522 272690813
## 6 90178 15586 281421906
## results.property_crime results.burglary results.aggravated_assault
## 1 12063935 2593784 1099207
## 2 11805323 2506400 1037049
## 3 11558975 2461026 1023201
## 4 10951827 2332735 976583
## 5 10208334 2100739 911740
## 6 10182584 2050992 911706
## results.robbery results.year
## 1 580509 1995
## 2 535594 1996
## 3 498534 1997
## 4 447186 1998
## 5 409371 1999
## 6 408016 2000
MaxValue<-apply(crimes_clean, 2, max)
MinValue<-apply(crimes_clean, 2, min)
length(MinValue)==ncol(crimes)
## [1] FALSE
crimesdf<-as.data.frame(scale(crimes_clean, center = MinValue, scale = MaxValue- MinValue))
ind<-sample(1:nrow(crimesdf),8)
traindf<-crimesdf[ind,]
testdf<-crimesdf[-ind,]
plot(neuralModel)
predictions<- compute(neuralModel,testdf [,1:10])
str(predictions)
## List of 2
## $ neurons :List of 3
## ..$ : num [1:14, 1:11] 1 1 1 1 1 1 1 1 1 1 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:14] "2" "3" "5" "7" ...
## .. .. ..$ : chr [1:11] "1" "results.motor_vehicle_theft" "results.violent_crime" "results.larceny" ...
## ..$ : num [1:14, 1:5] 1 1 1 1 1 1 1 1 1 1 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:14] "2" "3" "5" "7" ...
## .. .. ..$ : NULL
## ..$ : num [1:14, 1:3] 1 1 1 1 1 1 1 1 1 1 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:14] "2" "3" "5" "7" ...
## .. .. ..$ : NULL
## $ net.result: num [1:14, 1] 0.451 0.507 0.441 0.37 0.324 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:14] "2" "3" "5" "7" ...
## .. ..$ : NULL
predictions<- predictions$net.result*(max(testdf$results.homicide)-min(testdf$results.homicide))+min(testdf$results.homicide)
actualValues<- (testdf$results.homicide*(max(testdf$results.homicide)-min(testdf$results.homicide))+min(testdf$results.homicide))
MSE<- sum((predictions - actualValues)^2)/ nrow(testdf)
MSE
## [1] 0.01376197041