india webscraping

First, the required packages are loaded and the working directory is set.

library(jsonlite)

## Warning: package 'jsonlite' was built under R version 3.2.3

library(httr)

## Warning: package 'httr' was built under R version 3.2.3

library(tidyr)

## Warning: package 'tidyr' was built under R version 3.2.3

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.2.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.2.4

library(maptools)

## Warning: package 'maptools' was built under R version 3.2.3

## Loading required package: sp

## Warning: package 'sp' was built under R version 3.2.3

## Checking rgeos availability: FALSE
##      Note: when rgeos is not available, polygon geometry     computations in maptools depend on gpclib,
##      which has a restricted licence. It is disabled by default;
##      to enable gpclib, type gpclibPermit()

library(gpclib)

## General Polygon Clipper Library for R (version 1.5-5)
##  Type 'class ? gpc.poly' for help

library(RColorBrewer)
library(maps)

## Warning: package 'maps' was built under R version 3.2.3

## 
##  # maps v3.1: updated 'world': all lakes moved to separate new #
##  # 'lakes' database. Type '?world' or 'news(package="maps")'.  #

# Make the project folder the working directory for the rest of the script.
# NOTE(review): this is a machine-specific absolute path; setwd() errors on any
# computer where the folder does not exist.
setwd("/Users/chittampalliyashaswini/Desktop/Yadu")

Next, the provincial data for the GINI coefficient of India is downloaded from the Knoema API and then spread out so that each variable has its own column. The spellings of some province names have to be changed so that they match the names listed in the shapefile.

# Download the 2005 records from the Knoema API and keep the `data` element.
dat.india.province <- fromJSON("https://knoema.com/api/1.0/data/wiwuiff?Time=2005-2005&region=1000130,1000020,1000040,1000050,1000060,1000080,1000090,1000100,1000110,1000120,1000140,1000150,1000160,1000220,1000210,1000230,1000290,1000280,1000270,1000250&variable=1000130,1000140,1000070,1000080&Frequencies=A")$data

# Drop the metadata columns that the analysis does not use.
dat.india.province <- subset(
  dat.india.province,
  select = -c(Unit, Time, RegionId, Frequency, Scale)
)

# Long to wide: one column per value of `variable`, filled from `Value`.
dat.india.province <- spread(dat.india.province, variable, Value)
dat.india.province <- data.frame(dat.india.province)

# Label the four value columns by position.
# NOTE(review): assumes spread() returns the columns in exactly this order;
# confirm against the API response.
names(dat.india.province)[2:5] <- c(
  "Ruralization (Percentage)",
  "Urbanization (Percentage)",
  "RuralGini",
  "UrbanGini"
)

# Respell two regions so they match the shapefile names.
# NOTE(review): rows 8 and 14 are addressed by position, so this silently
# renames the wrong regions if the API changes its row order.
dat.india.province$region[c(8, 14)] <- c("Jammu and Kashmir", "Odisha")

Because this dataset does not include any information for Telangana, which was formed only recently, we copy the data for the province of Andhra Pradesh, apply it to Telangana, and then sort the states in alphabetical order.

# Take Andhra Pradesh's values (without the region column) ...
dat.telangana <- subset(
  dat.india.province,
  region == "Andhra Pradesh",
  select = -c(region)
)
# ... and attach them to a new row labelled Telangana.
dat.telangana <- data.frame(region = "Telangana", dat.telangana)

# data.frame() rewrites non-syntactic column names, so restore the originals
# before binding the new row onto the main table.
names(dat.telangana) <- names(dat.india.province)
dat.india.province <- rbind(dat.india.province, dat.telangana)

# Keep the table sorted alphabetically by region.
dat.india.province <- dat.india.province[order(dat.india.province$region), ]

Now, for each province, we compute the GINI coefficient by weighting the rural GINI coefficient with the percentage of the rural population and the urban GINI coefficient with the percentage of the urban population.

# Province-level GINI: the rural and urban Gini coefficients weighted by the
# rural and urban percentages, divided by 100 to turn percentages into shares.
#
# The percentage columns are read with [[ ]] and their full names. The earlier
# `$Ruralization` / `$Urbanization` only resolved through `$` partial matching,
# which returns NULL as soon as the prefix is ambiguous or the object is a
# tibble.
#
# NOTE(review): the regressions further down report a positive slope against
# both percentage columns, which cannot happen if the two columns sum to 100.
# Confirm that they really are complementary population shares.
dat.india.province$GINI <- (
  dat.india.province$RuralGini *
    dat.india.province[["Ruralization (Percentage)"]] +
    dat.india.province$UrbanGini *
      dat.india.province[["Urbanization (Percentage)"]]
) / 100

The following scatter plots, each with a fitted linear regression line, show the relationship between urbanization and the GINI coefficient and between ruralization and the GINI coefficient across provinces.

# Scatter plot of GINI against the urban percentage, with a least-squares line.
ggplot(
  data = dat.india.province,
  mapping = aes(x = `Urbanization (Percentage)`, y = GINI)
) +
  geom_point(color = "red") +
  geom_smooth(method = "lm")

# Fit the same straight line explicitly to print its coefficients.
lm(formula = GINI ~ `Urbanization (Percentage)`, data = dat.india.province)

## 
## Call:
## lm(formula = GINI ~ `Urbanization (Percentage)`, data = dat.india.province)
## 
## Coefficients:
##                 (Intercept)  `Urbanization (Percentage)`  
##                    0.037315                     0.006261

# Full regression table for GINI on the urban percentage.
summary(
  lm(formula = GINI ~ `Urbanization (Percentage)`, data = dat.india.province)
)

## 
## Call:
## lm(formula = GINI ~ `Urbanization (Percentage)`, data = dat.india.province)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.055301 -0.019227 -0.003567  0.023605  0.066789 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 0.0373150  0.0204036   1.829   0.0832 .  
## `Urbanization (Percentage)` 0.0062608  0.0007829   7.997 1.69e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03048 on 19 degrees of freedom
## Multiple R-squared:  0.7709, Adjusted R-squared:  0.7589 
## F-statistic: 63.95 on 1 and 19 DF,  p-value: 1.686e-07

# Scatter plot of GINI against the rural percentage, with a least-squares line.
ggplot(
  data = dat.india.province,
  mapping = aes(x = `Ruralization (Percentage)`, y = GINI)
) +
  geom_point(color = "red") +
  geom_smooth(method = "lm")

# Fit the same straight line explicitly to print its coefficients.
lm(formula = GINI ~ `Ruralization (Percentage)`, data = dat.india.province)

## 
## Call:
## lm(formula = GINI ~ `Ruralization (Percentage)`, data = dat.india.province)
## 
## Coefficients:
##                 (Intercept)  `Ruralization (Percentage)`  
##                    0.023372                     0.004427

# Full regression table for GINI on the rural percentage.
summary(
  lm(formula = GINI ~ `Ruralization (Percentage)`, data = dat.india.province)
)

## 
## Call:
## lm(formula = GINI ~ `Ruralization (Percentage)`, data = dat.india.province)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.055002 -0.014365  0.007769  0.014592  0.037581 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 0.0233720  0.0187340   1.248    0.227    
## `Ruralization (Percentage)` 0.0044273  0.0004687   9.446 1.31e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02668 on 19 degrees of freedom
## Multiple R-squared:  0.8244, Adjusted R-squared:  0.8152 
## F-statistic: 89.23 on 1 and 19 DF,  p-value: 1.308e-08

# NOTE(review): this plot and model repeat the ruralization analysis directly
# above, with identical output; confirm whether a different variable was meant.
ggplot(
  data = dat.india.province,
  mapping = aes(x = `Ruralization (Percentage)`, y = GINI)
) +
  geom_point(color = "red") +
  geom_smooth(method = "lm")

# Coefficients of the straight-line fit of GINI on the rural percentage.
lm(formula = GINI ~ `Ruralization (Percentage)`, data = dat.india.province)

## 
## Call:
## lm(formula = GINI ~ `Ruralization (Percentage)`, data = dat.india.province)
## 
## Coefficients:
##                 (Intercept)  `Ruralization (Percentage)`  
##                    0.023372                     0.004427

# NOTE(review): this repeats the ruralization regression summary shown earlier.
summary(
  lm(formula = GINI ~ `Ruralization (Percentage)`, data = dat.india.province)
)

## 
## Call:
## lm(formula = GINI ~ `Ruralization (Percentage)`, data = dat.india.province)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.055002 -0.014365  0.007769  0.014592  0.037581 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 0.0233720  0.0187340   1.248    0.227    
## `Ruralization (Percentage)` 0.0044273  0.0004687   9.446 1.31e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02668 on 19 degrees of freedom
## Multiple R-squared:  0.8244, Adjusted R-squared:  0.8152 
## F-statistic: 89.23 on 1 and 19 DF,  p-value: 1.308e-08

# Download the 1974 records from the Knoema API and keep the `data` element.
dat.india.province1974 <- fromJSON("https://knoema.com/api/1.0/data/wiwuiff?Time=1974-1974&region=1000130,1000020,1000030,1000040,1000050,1000060,1000070,1000080,1000090,1000100,1000110,1000120,1000140,1000150,1000160,1000170,1000180,1000190,1000200,1000210,1000220,1000230,1000240,1000250,1000260,1000270,1000280,1000290&variable=1000070,1000080,1000140,1000130&Frequencies=A")$data

# Keep only the three columns needed to reshape the data.
dat.india.province1974 <- subset(
  dat.india.province1974,
  select = c(variable, region, Value)
)

# Long to wide: one column per value of `variable`, filled from `Value`.
dat.india.province1974 <- spread(dat.india.province1974, variable, Value)
dat.india.province1974 <- data.frame(dat.india.province1974)

# Respell two regions so they match the shapefile names.
# NOTE(review): rows 7 and 12 are addressed by position; verify that they are
# the intended regions in the 1974 response.
dat.india.province1974$region[c(7, 12)] <- c("Jammu and Kashmir", "Odisha")

# Label the first two value columns as the 1974 Gini coefficients.
# NOTE(review): four variables are requested but only columns 2:3 are renamed;
# this assumes only the two Gini series come back for 1974 - TODO confirm.
names(dat.india.province1974)[2:3] <- c("RuralGini1974", "UrbanGini1974")

# Each state named on the left receives a copy of the 1974 values of the state
# on the right. One loop replaces four copy-pasted stanzas that differed only
# in these two names.
parent.state <- c(
  Telangana = "Andhra Pradesh",
  Chhattisgarh = "Madhya Pradesh",
  Jharkhand = "Bihar",
  Uttarakhand = "Uttar Pradesh"
)

for (new.state in names(parent.state)) {
  # Values of the parent state, without its region label.
  parent.row <- subset(
    dat.india.province1974,
    region == parent.state[[new.state]],
    select = -c(region)
  )
  # Fail with a readable message instead of data.frame()'s
  # "differing number of rows" error when the parent state is absent.
  if (nrow(parent.row) == 0) {
    stop(
      "No 1974 data for ", parent.state[[new.state]],
      "; cannot fill in ", new.state,
      call. = FALSE
    )
  }
  new.row <- data.frame(region = new.state, parent.row)
  # data.frame() may rewrite column names, so restore the table's own names.
  colnames(new.row) <- colnames(dat.india.province1974)
  dat.india.province1974 <- rbind(dat.india.province1974, new.row)
}

# Sorting once after all rows are added gives the same alphabetical order as
# re-sorting after every insertion.
dat.india.province1974 <-
  dat.india.province1974[order(dat.india.province1974$region), ]

# Put the 2005 and 1974 figures side by side. merge() keeps only the regions
# that appear in both tables.
dat.india.provincecomp <- merge(
  dat.india.province,
  dat.india.province1974,
  by = "region"
)

# Size of the change in each Gini coefficient between 1974 and 2005, as a
# percentage of the 1974 value. The earlier version divided by the 2005 value,
# which is not the baseline of a 1974-to-2005 change. abs() is kept, so the
# columns hold the magnitude of the change rather than its direction.
dat.india.provincecomp$`Rural Gini Coefficient Percent Change` <-
  abs(dat.india.provincecomp$RuralGini - dat.india.provincecomp$RuralGini1974) /
  dat.india.provincecomp$RuralGini1974 * 100
dat.india.provincecomp$`Urban Gini Coefficient Percent Change` <-
  abs(dat.india.provincecomp$UrbanGini - dat.india.provincecomp$UrbanGini1974) /
  dat.india.provincecomp$UrbanGini1974 * 100

Now, we read in the shapefile for India and plot the map of India and color each province according to its GINI coefficient.

# maptools reported at load time that rgeos is unavailable and that gpclib is
# disabled by default; this call enables gpclib for polygon computations.
gpclibPermit()

## Warning in gpclibPermit(): support for gpclib will be withdrawn from
## maptools at the next major release

## [1] TRUE

# Check that the permission is now in effect.
gpclibPermitStatus()

## [1] TRUE

# Read the level-1 (state) boundaries of India.
# The datum was declared as NAD27, a North American datum that does not apply
# to India; it is now WGS84. Nothing below reprojects the data, so only the
# recorded CRS changes, not the plotted coordinates.
# NOTE(review): the file name looks like a GADM download, which GADM publishes
# in longitude/latitude WGS84 - confirm the source of the shapefile.
map.ind.regions1 <- readShapePoly(
  "/Users/chittampalliyashaswini/Desktop/Yadu/IND_adm_shp/IND_adm1.shp",
  proj4string = CRS("+proj=longlat +datum=WGS84")
)

# Flatten the polygons into a data frame whose `id` is taken from the NAME_1
# attribute, so that rows can be matched to regions by name.
map.ind.regions1 <- fortify(map.ind.regions1, region = "NAME_1")

# Rename the coordinate columns to x and y for use with geom_map().
map.ind.regions1 <- rename(map.ind.regions1, x = long, y = lat)

# Nine-step "BrBG" palette for the GINI map.
mycolors <- brewer.pal(n = 9, name = "BrBG")

# Choropleth of India: each region is filled according to its GINI value, with
# the legend boxed in the bottom-right corner.
ggplot(data = dat.india.province) +
  geom_map(
    mapping = aes(fill = GINI, map_id = region),
    map = map.ind.regions1
  ) +
  expand_limits(map.ind.regions1) +
  coord_map(projection = "polyconic") +
  theme_bw() +
  scale_fill_gradientn(name = "GINI", colours = mycolors) +
  theme(
    legend.justification = c(1, 0),
    legend.position = c(1, 0),
    legend.background = element_rect(colour = "black")
  )

# Nine-step "OrRd" palette for the rural-change map.
mycolors2 <- brewer.pal(n = 9, name = "OrRd")

# Choropleth of India: each region is filled according to the percent change in
# its rural Gini coefficient, with the legend boxed in the bottom-right corner.
ggplot(data = dat.india.provincecomp) +
  geom_map(
    mapping = aes(
      fill = `Rural Gini Coefficient Percent Change`,
      map_id = region
    ),
    map = map.ind.regions1
  ) +
  expand_limits(map.ind.regions1) +
  coord_map(projection = "polyconic") +
  theme_bw() +
  scale_fill_gradientn(name = "Rural GINI % Change", colours = mycolors2) +
  theme(
    legend.justification = c(1, 0),
    legend.position = c(1, 0),
    legend.background = element_rect(colour = "black")
  )

# Nine-step "Blues" palette for the urban-change map.
mycolors3 <- brewer.pal(n = 9, name = "Blues")

# Choropleth of India: each region is filled according to the percent change in
# its urban Gini coefficient, with the legend boxed in the bottom-right corner.
ggplot(data = dat.india.provincecomp) +
  geom_map(
    mapping = aes(
      fill = `Urban Gini Coefficient Percent Change`,
      map_id = region
    ),
    map = map.ind.regions1
  ) +
  expand_limits(map.ind.regions1) +
  coord_map(projection = "polyconic") +
  theme_bw() +
  scale_fill_gradientn(name = "Urban GINI % Change", colours = mycolors3) +
  theme(
    legend.justification = c(1, 0),
    legend.position = c(1, 0),
    legend.background = element_rect(colour = "black")
  )

#grid.arrange(plot1, plot2, plot3, top = textGrob("Maps of India", gp = gpar(fontface = "bold")), ncol = 1, nrow = 3)

http://gis.stackexchange.com/questions/102781/chloropeth-map-in-r-data-on-map-not-represented-as-in-listed-in-the-file

india webscraping

Yadu

April 23, 2016