Source file ⇒ categoricalvisualisationproject.rmd
Exploratory data analysis
REVEAL THYSELF……………
library(foreign)
# Import the SPSS data file; with its default arguments read.spss() returns a list
allvariables.1<-read.spss("all variables.111.sav")
# Alternative import from a CSV export (disabled):
#allvariables.1<-read.csv("all variables.1.csv",header = T)
# Convert the list into a data frame (only needed for the SPSS import above)
allvariables.1<-as.data.frame(allvariables.1)
class(allvariables.1)
## [1] "data.frame"
## [1] "data.frame"
#knowing the class of each col
sapply(allvariables.1,class)
## STATEID DISTID VNID ROOMS INCOME
## "factor" "numeric" "numeric" "numeric" "numeric"
## EDU.HH WATER RICE.P ELEC.ACCESS ELEC.HRS
## "factor" "factor" "numeric" "factor" "numeric"
## NFE ASSETS NADULTM NADULTF NCHILDM
## "factor" "numeric" "numeric" "numeric" "numeric"
## NCHILDF NTEENM NTEENF URBAN2011 NFE.TYPES
## "numeric" "numeric" "numeric" "factor" "factor"
## NADULT NCHILD NTEEN ELEC.HRS.3 ELEC.HRS.4
## "numeric" "numeric" "numeric" "factor" "factor"
## filter_. ELEC.HRS.3.NEW
## "factor" "factor"
## 'data.frame': 39954 obs. of 27 variables:
## $ STATEID : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ DISTID : num 2 2 2 2 2 2 2 2 2 2 ...
## $ VNID : num 1 1 1 1 1 1 1 1 1 1 ...
## $ ROOMS : num 12 10 3 4 10 5 5 2 7 2 ...
## $ INCOME : num 176100 1039150 182340 90760 212600 ...
## $ EDU.HH : Factor w/ 18 levels "none 0","1st class 1",..: 1 1 11 1 1 1 1 1 5 1 ...
## $ WATER : Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 1 2 2 ...
## $ RICE.P : num 17 20 15 20 20 12 25 25 12 25 ...
## $ ELEC.ACCESS : Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 2 2 2 ...
## $ ELEC.HRS : num 12 8 8 8 8 14 3 22 22 8 ...
## $ NFE : Factor w/ 2 levels "No 0","Yes 1": 1 1 1 1 1 1 1 1 1 1 ...
## $ ASSETS : num 20 24 22 16 17 12 13 7 11 10 ...
## $ NADULTM : num 2 5 1 1 3 4 2 1 1 1 ...
## $ NADULTF : num 2 3 2 1 3 3 2 1 1 1 ...
## $ NCHILDM : num 2 3 1 1 1 2 0 4 3 2 ...
## $ NCHILDF : num 2 3 1 1 1 2 0 1 0 0 ...
## $ NTEENM : num 1 1 1 0 2 0 0 0 2 0 ...
## $ NTEENF : num 0 1 1 1 0 0 4 1 1 0 ...
## $ URBAN2011 : Factor w/ 2 levels "rural 0","urban 1": 1 1 1 1 1 1 1 1 1 1 ...
## $ NFE.TYPES : Factor w/ 71 levels "Agriculture 0",..: 49 NA 65 NA NA NA 37 NA NA 46 ...
## $ NADULT : num 4 8 3 2 6 7 4 2 2 2 ...
## $ NCHILD : num 4 6 2 2 2 4 0 5 3 2 ...
## $ NTEEN : num 1 2 2 1 2 0 4 1 3 0 ...
## $ ELEC.HRS.3 : Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
## $ ELEC.HRS.4 : Factor w/ 4 levels "0-6","6-12","12-18",..: 3 2 2 2 2 3 1 4 4 2 ...
## $ filter_. : Factor w/ 2 levels "Not Selected",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ ELEC.HRS.3.NEW: Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
## STATEID DISTID VNID ROOMS INCOME EDU.HH WATER RICE.P
## 1 Jammu & Kashmir 01 2 1 12 176100 none 0 Yes 1 17
## 2 Jammu & Kashmir 01 2 1 10 1039150 none 0 Yes 1 20
## 3 Jammu & Kashmir 01 2 1 3 182340 Secondary 10 Yes 1 15
## 4 Jammu & Kashmir 01 2 1 4 90760 none 0 Yes 1 20
## 5 Jammu & Kashmir 01 2 1 10 212600 none 0 Yes 1 20
## 6 Jammu & Kashmir 01 2 1 5 152100 none 0 Yes 1 12
## ELEC.ACCESS ELEC.HRS NFE ASSETS NADULTM NADULTF NCHILDM NCHILDF NTEENM
## 1 Yes 1 12 No 0 20 2 2 2 2 1
## 2 Yes 1 8 No 0 24 5 3 3 3 1
## 3 Yes 1 8 No 0 22 1 2 1 1 1
## 4 Yes 1 8 No 0 16 1 1 1 1 0
## 5 Yes 1 8 No 0 17 3 3 1 1 2
## 6 Yes 1 14 No 0 12 4 3 2 2 0
## NTEENF URBAN2011 NFE.TYPES NADULT NCHILD NTEEN ELEC.HRS.3
## 1 0 rural 0 Land transport 70 4 4 1 1-16
## 2 1 rural 0 <NA> 8 6 2 1-16
## 3 1 rural 0 Medical 93 3 2 2 1-16
## 4 1 rural 0 <NA> 2 2 1 1-16
## 5 0 rural 0 <NA> 6 2 2 1-16
## 6 0 rural 0 <NA> 7 4 0 1-16
## ELEC.HRS.4 filter_. ELEC.HRS.3.NEW
## 1 12-18 Selected 1-16
## 2 6-12 Selected 1-16
## 3 6-12 Selected 1-16
## 4 6-12 Selected 1-16
## 5 6-12 Selected 1-16
## 6 12-18 Selected 1-16
## STATEID DISTID VNID ROOMS INCOME EDU.HH WATER RICE.P
## 39949 Tamil Nadu 33 30 11 5 27000 7th class 7 No 0 26
## 39950 Tamil Nadu 33 30 11 2 9000 none 0 Yes 1 26
## 39951 Tamil Nadu 33 30 11 3 110900 Secondary 10 No 0 18
## 39952 Tamil Nadu 33 30 11 2 15000 5th class 5 No 0 26
## 39953 Tamil Nadu 33 30 11 1 69040 7th class 7 No 0 23
## 39954 Tamil Nadu 33 30 11 2 37000 6th class 6 No 0 23
## ELEC.ACCESS ELEC.HRS NFE ASSETS NADULTM NADULTF NCHILDM NCHILDF
## 39949 Yes 1 6 Yes 1 26 1 1 0 0
## 39950 No 0 NA No 0 8 0 1 0 0
## 39951 Yes 1 6 No 0 17 1 2 1 0
## 39952 Yes 1 6 Yes 1 19 2 1 1 1
## 39953 Yes 1 8 No 0 15 1 3 0 1
## 39954 Yes 1 6 Yes 1 18 1 1 0 1
## NTEENM NTEENF URBAN2011 NFE.TYPES NADULT NCHILD NTEEN
## 39949 0 0 urban 1 Retail household 67 2 0 0
## 39950 0 0 urban 1 Air transport 72 1 0 0
## 39951 0 2 urban 1 Retail textiles 66 3 1 2
## 39952 0 0 urban 1 Retail nec 68 3 2 0
## 39953 0 0 urban 1 Personal services 96 4 1 0
## 39954 1 1 urban 1 Retail food 65 2 1 2
## ELEC.HRS.3 ELEC.HRS.4 filter_. ELEC.HRS.3.NEW
## 39949 1-16 6-12 Not Selected 1-16
## 39950 <NA> <NA> Not Selected <NA>
## 39951 1-16 6-12 Not Selected 1-16
## 39952 1-16 6-12 Not Selected 1-16
## 39953 1-16 6-12 Not Selected 1-16
## 39954 1-16 6-12 Not Selected 1-16
## [1] 39954 27
summary(allvariables.1) # gives the info about NAs
## STATEID DISTID VNID
## Karnataka 29 : 3865 Min. : 1.00 Min. : 1.000
## Uttar Pradesh 09 : 3824 1st Qu.: 7.00 1st Qu.: 2.000
## Maharashtra 27 : 3309 Median :12.00 Median : 4.000
## Madhya Pradesh 23: 3123 Mean :15.43 Mean : 5.608
## Rajasthan 08 : 2707 3rd Qu.:21.00 3rd Qu.: 7.000
## West Bengal 19 : 2435 Max. :68.00 Max. :39.000
## (Other) :20691
## ROOMS INCOME EDU.HH WATER
## Min. : 0.000 Min. :-1037040 none 0 :24520 No 0 :27201
## 1st Qu.: 2.000 1st Qu.: 38220 5th class 5 : 3514 Yes 1:12613
## Median : 2.000 Median : 72470 Secondary 10: 2057 NA's : 140
## Mean : 2.684 Mean : 125336 4th class 4 : 1673
## 3rd Qu.: 3.000 3rd Qu.: 141500 8th class 8 : 1524
## Max. :50.000 Max. :11360000 (Other) : 6390
## NA's :172 NA's : 276
## RICE.P ELEC.ACCESS ELEC.HRS NFE
## Min. : 0.00 No 0 : 4970 Min. : 0.00 No 0 :31564
## 1st Qu.: 18.00 Yes 1:34840 1st Qu.: 9.00 Yes 1: 8390
## Median : 20.00 NA's : 144 Median :16.00
## Mean : 21.92 Mean :15.32
## 3rd Qu.: 26.00 3rd Qu.:22.00
## Max. :120.00 Max. :24.00
## NA's :5197
## ASSETS NADULTM NADULTF NCHILDM
## Min. : 0.00 Min. :0.000 Min. :0.000 Min. : 0.0000
## 1st Qu.:10.00 1st Qu.:1.000 1st Qu.:1.000 1st Qu.: 0.0000
## Median :16.00 Median :1.000 Median :1.000 Median : 0.0000
## Mean :15.39 Mean :1.424 Mean :1.492 Mean : 0.7157
## 3rd Qu.:21.00 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.: 1.0000
## Max. :33.00 Max. :9.000 Max. :9.000 Max. :10.0000
## NA's :18
## NCHILDF NTEENM NTEENF URBAN2011
## Min. : 0.0000 Min. :0.0000 Min. :0.0000 rural 0:26134
## 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.:0.0000 urban 1:13820
## Median : 0.0000 Median :0.0000 Median :0.0000
## Mean : 0.6566 Mean :0.2852 Mean :0.2926
## 3rd Qu.: 1.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :10.0000 Max. :5.0000 Max. :5.0000
##
## NFE.TYPES NADULT NCHILD
## Agriculture 0 : 7570 Min. : 0.000 Min. : 0.000
## Construction 50 : 3190 1st Qu.: 2.000 1st Qu.: 0.000
## Public admin 90 : 1565 Median : 2.000 Median : 1.000
## Retail food 65 : 1230 Mean : 2.917 Mean : 1.372
## Land transport 70: 954 3rd Qu.: 4.000 3rd Qu.: 2.000
## (Other) : 9042 Max. :18.000 Max. :18.000
## NA's :16403
## NTEEN ELEC.HRS.3 ELEC.HRS.4 filter_.
## Min. :0.0000 No Access: 22 0-6 : 2975 Not Selected:36056
## 1st Qu.:0.0000 1-16 :16091 6-12 : 8550 Selected : 3898
## Median :0.0000 17-24 :18644 12-18: 6252
## Mean :0.5778 NA's : 5197 18-24:16980
## 3rd Qu.:1.0000 NA's : 5197
## Max. :7.0000
##
## ELEC.HRS.3.NEW
## No Access: 92
## 1-16 :17334
## 16-24 :17331
## NA's : 5197
##
##
##
#utils::View(allvariables.1) Shows the entire data set
# allvariables.1$STATEID<-as.factor(as.character(allvariables.1$STATEID))
# allvariables.1$EDU.HH<-as.factor(as.character(allvariables.1$EDU.HH))
# allvariables.1$WATER<-as.factor(as.character(allvariables.1$WATER))
# allvariables.1$ ELEC.ACCESS<-as.factor(as.character(allvariables.1$ ELEC.ACCESS))
# allvariables.1$NFE<-as.factor(as.character(allvariables.1$NFE))
# allvariables.1$URBAN2011<-as.factor(as.character(allvariables.1$URBAN2011))
# allvariables.1$NFE.TYPES<-as.factor(as.character(allvariables.1$NFE.TYPES))
# allvariables.1$ELEC.HRS.3<-as.factor(as.character(allvariables.1$ELEC.HRS.3))
# allvariables.1$ELEC.HRS.4<-as.factor(as.character(allvariables.1$ELEC.HRS.4))
# allvariables.1$filter_.<-as.factor(as.character(allvariables.1$filter_.))
# sapply(allvariables.1,class)
# str(allvariables.1)
# summary(allvariables.1)
# Indices of the rows that contain at least one missing value
rownos <- which(rowSums(is.na(allvariables.1)) > 0)
# Number of such rows (19278 in the rendered run)
length(rownos)
## [1] 19278
# Same count obtained directly, without keeping the row indices
rowsums <- sum(rowSums(is.na(allvariables.1)) > 0)
rowsums
## [1] 19278
# Number of missing values in each column
na_count <- vapply(allvariables.1, function(col) sum(is.na(col)), integer(1))
# Wrap the counts in a one-column data frame; the outer parentheses print it
(na_count <- data.frame(na_count))
## na_count
## STATEID 0
## DISTID 0
## VNID 0
## ROOMS 172
## INCOME 0
## EDU.HH 276
## WATER 140
## RICE.P 0
## ELEC.ACCESS 144
## ELEC.HRS 5197
## NFE 0
## ASSETS 18
## NADULTM 0
## NADULTF 0
## NCHILDM 0
## NCHILDF 0
## NTEENM 0
## NTEENF 0
## URBAN2011 0
## NFE.TYPES 16403
## NADULT 0
## NCHILD 0
## NTEEN 0
## ELEC.HRS.3 5197
## ELEC.HRS.4 5197
## filter_. 0
## ELEC.HRS.3.NEW 5197
# Alternatively: total the NA indicator matrix column by column
colSums(is.na(allvariables.1))
## STATEID DISTID VNID ROOMS INCOME
## 0 0 0 172 0
## EDU.HH WATER RICE.P ELEC.ACCESS ELEC.HRS
## 276 140 0 144 5197
## NFE ASSETS NADULTM NADULTF NCHILDM
## 0 18 0 0 0
## NCHILDF NTEENM NTEENF URBAN2011 NFE.TYPES
## 0 0 0 0 16403
## NADULT NCHILD NTEEN ELEC.HRS.3 ELEC.HRS.4
## 0 0 0 5197 5197
## filter_. ELEC.HRS.3.NEW
## 0 5197
# Positions (named by column) of the columns holding at least one NA
colnos <- which(colSums(is.na(allvariables.1)) > 0)
colnos
## ROOMS EDU.HH WATER ELEC.ACCESS ELEC.HRS
## 4 6 7 9 10
## ASSETS NFE.TYPES ELEC.HRS.3 ELEC.HRS.4 ELEC.HRS.3.NEW
## 12 20 24 25 27
# Drop NFE.TYPES: it has far more NAs than any other column (16403 in the counts above)
allvariables.1$NFE.TYPES<-NULL
# Keep only complete rows, i.e. remove every row that has at least one NA
allvariables.1nonas<-na.omit(allvariables.1)
# Inspect the structure of the NA-free data frame
str(allvariables.1nonas)
## 'data.frame': 34442 obs. of 26 variables:
## $ STATEID : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ DISTID : num 2 2 2 2 2 2 2 2 2 2 ...
## $ VNID : num 1 1 1 1 1 1 1 1 1 1 ...
## $ ROOMS : num 12 10 3 4 10 5 5 2 7 2 ...
## $ INCOME : num 176100 1039150 182340 90760 212600 ...
## $ EDU.HH : Factor w/ 18 levels "none 0","1st class 1",..: 1 1 11 1 1 1 1 1 5 1 ...
## $ WATER : Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 1 2 2 ...
## $ RICE.P : num 17 20 15 20 20 12 25 25 12 25 ...
## $ ELEC.ACCESS : Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 2 2 2 ...
## $ ELEC.HRS : num 12 8 8 8 8 14 3 22 22 8 ...
## $ NFE : Factor w/ 2 levels "No 0","Yes 1": 1 1 1 1 1 1 1 1 1 1 ...
## $ ASSETS : num 20 24 22 16 17 12 13 7 11 10 ...
## $ NADULTM : num 2 5 1 1 3 4 2 1 1 1 ...
## $ NADULTF : num 2 3 2 1 3 3 2 1 1 1 ...
## $ NCHILDM : num 2 3 1 1 1 2 0 4 3 2 ...
## $ NCHILDF : num 2 3 1 1 1 2 0 1 0 0 ...
## $ NTEENM : num 1 1 1 0 2 0 0 0 2 0 ...
## $ NTEENF : num 0 1 1 1 0 0 4 1 1 0 ...
## $ URBAN2011 : Factor w/ 2 levels "rural 0","urban 1": 1 1 1 1 1 1 1 1 1 1 ...
## $ NADULT : num 4 8 3 2 6 7 4 2 2 2 ...
## $ NCHILD : num 4 6 2 2 2 4 0 5 3 2 ...
## $ NTEEN : num 1 2 2 1 2 0 4 1 3 0 ...
## $ ELEC.HRS.3 : Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
## $ ELEC.HRS.4 : Factor w/ 4 levels "0-6","6-12","12-18",..: 3 2 2 2 2 3 1 4 4 2 ...
## $ filter_. : Factor w/ 2 levels "Not Selected",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ ELEC.HRS.3.NEW: Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:5512] 42 148 189 198 221 234 270 272 274 278 ...
## .. ..- attr(*, "names")= chr [1:5512] "42" "148" "189" "198" ...
summary(allvariables.1nonas)
## STATEID DISTID VNID
## Karnataka 29 : 3546 Min. : 1.0 Min. : 1.000
## Maharashtra 27 : 3095 1st Qu.: 6.0 1st Qu.: 2.000
## Madhya Pradesh 23: 2532 Median :12.0 Median : 4.000
## Uttar Pradesh 09 : 2326 Mean :14.6 Mean : 5.788
## Rajasthan 08 : 2296 3rd Qu.:20.0 3rd Qu.: 7.000
## Andhra Pradesh 28: 2118 Max. :68.0 Max. :39.000
## (Other) :18529
## ROOMS INCOME EDU.HH WATER
## Min. : 1.000 Min. :-1037040 none 0 :20413 No 0 :22154
## 1st Qu.: 2.000 1st Qu.: 43500 5th class 5 : 3201 Yes 1:12288
## Median : 2.000 Median : 80500 Secondary 10: 1960
## Mean : 2.791 Mean : 136009 4th class 4 : 1551
## 3rd Qu.: 4.000 3rd Qu.: 154000 8th class 8 : 1417
## Max. :50.000 Max. :11360000 2nd class 2 : 1246
## (Other) : 4654
## RICE.P ELEC.ACCESS ELEC.HRS NFE
## Min. : 0.00 No 0 : 0 Min. : 0.00 No 0 :26760
## 1st Qu.: 18.00 Yes 1:34442 1st Qu.: 9.00 Yes 1: 7682
## Median : 22.00 Median :16.00
## Mean : 22.59 Mean :15.32
## 3rd Qu.: 27.00 3rd Qu.:22.00
## Max. :120.00 Max. :24.00
##
## ASSETS NADULTM NADULTF NCHILDM
## Min. : 1.00 Min. :0.000 Min. :0.000 Min. :0.0000
## 1st Qu.:12.00 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:0.0000
## Median :17.00 Median :1.000 Median :1.000 Median :0.0000
## Mean :16.68 Mean :1.463 Mean :1.528 Mean :0.6926
## 3rd Qu.:21.00 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :33.00 Max. :9.000 Max. :9.000 Max. :8.0000
##
## NCHILDF NTEENM NTEENF URBAN2011
## Min. : 0.0000 Min. :0.0000 Min. :0.0000 rural 0:21240
## 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.:0.0000 urban 1:13202
## Median : 0.0000 Median :0.0000 Median :0.0000
## Mean : 0.6318 Mean :0.2868 Mean :0.2926
## 3rd Qu.: 1.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :10.0000 Max. :5.0000 Max. :5.0000
##
## NADULT NCHILD NTEEN ELEC.HRS.3
## Min. : 0.000 Min. : 0.000 Min. :0.0000 No Access: 22
## 1st Qu.: 2.000 1st Qu.: 0.000 1st Qu.:0.0000 1-16 :15932
## Median : 3.000 Median : 1.000 Median :0.0000 17-24 :18488
## Mean : 2.991 Mean : 1.324 Mean :0.5795
## 3rd Qu.: 4.000 3rd Qu.: 2.000 3rd Qu.:1.0000
## Max. :18.000 Max. :18.000 Max. :7.0000
##
## ELEC.HRS.4 filter_. ELEC.HRS.3.NEW
## 0-6 : 2930 Not Selected:30607 No Access: 91
## 6-12 : 8491 Selected : 3835 1-16 :17168
## 12-18: 6189 16-24 :17183
## 18-24:16832
##
##
##
## [1] 34442 26
head(allvariables.1nonas)
## STATEID DISTID VNID ROOMS INCOME EDU.HH WATER RICE.P
## 1 Jammu & Kashmir 01 2 1 12 176100 none 0 Yes 1 17
## 2 Jammu & Kashmir 01 2 1 10 1039150 none 0 Yes 1 20
## 3 Jammu & Kashmir 01 2 1 3 182340 Secondary 10 Yes 1 15
## 4 Jammu & Kashmir 01 2 1 4 90760 none 0 Yes 1 20
## 5 Jammu & Kashmir 01 2 1 10 212600 none 0 Yes 1 20
## 6 Jammu & Kashmir 01 2 1 5 152100 none 0 Yes 1 12
## ELEC.ACCESS ELEC.HRS NFE ASSETS NADULTM NADULTF NCHILDM NCHILDF NTEENM
## 1 Yes 1 12 No 0 20 2 2 2 2 1
## 2 Yes 1 8 No 0 24 5 3 3 3 1
## 3 Yes 1 8 No 0 22 1 2 1 1 1
## 4 Yes 1 8 No 0 16 1 1 1 1 0
## 5 Yes 1 8 No 0 17 3 3 1 1 2
## 6 Yes 1 14 No 0 12 4 3 2 2 0
## NTEENF URBAN2011 NADULT NCHILD NTEEN ELEC.HRS.3 ELEC.HRS.4 filter_.
## 1 0 rural 0 4 4 1 1-16 12-18 Selected
## 2 1 rural 0 8 6 2 1-16 6-12 Selected
## 3 1 rural 0 3 2 2 1-16 6-12 Selected
## 4 1 rural 0 2 2 1 1-16 6-12 Selected
## 5 0 rural 0 6 2 2 1-16 6-12 Selected
## 6 0 rural 0 7 4 0 1-16 12-18 Selected
## ELEC.HRS.3.NEW
## 1 1-16
## 2 1-16
## 3 1-16
## 4 1-16
## 5 1-16
## 6 1-16
tail(allvariables.1nonas)
## STATEID DISTID VNID ROOMS INCOME EDU.HH WATER RICE.P
## 39948 Tamil Nadu 33 30 11 2 97400 6th class 6 No 0 26
## 39949 Tamil Nadu 33 30 11 5 27000 7th class 7 No 0 26
## 39951 Tamil Nadu 33 30 11 3 110900 Secondary 10 No 0 18
## 39952 Tamil Nadu 33 30 11 2 15000 5th class 5 No 0 26
## 39953 Tamil Nadu 33 30 11 1 69040 7th class 7 No 0 23
## 39954 Tamil Nadu 33 30 11 2 37000 6th class 6 No 0 23
## ELEC.ACCESS ELEC.HRS NFE ASSETS NADULTM NADULTF NCHILDM NCHILDF
## 39948 Yes 1 6 No 0 18 3 2 1 0
## 39949 Yes 1 6 Yes 1 26 1 1 0 0
## 39951 Yes 1 6 No 0 17 1 2 1 0
## 39952 Yes 1 6 Yes 1 19 2 1 1 1
## 39953 Yes 1 8 No 0 15 1 3 0 1
## 39954 Yes 1 6 Yes 1 18 1 1 0 1
## NTEENM NTEENF URBAN2011 NADULT NCHILD NTEEN ELEC.HRS.3 ELEC.HRS.4
## 39948 1 0 urban 1 5 1 1 1-16 6-12
## 39949 0 0 urban 1 2 0 0 1-16 6-12
## 39951 0 2 urban 1 3 1 2 1-16 6-12
## 39952 0 0 urban 1 3 2 0 1-16 6-12
## 39953 0 0 urban 1 4 1 0 1-16 6-12
## 39954 1 1 urban 1 2 1 2 1-16 6-12
## filter_. ELEC.HRS.3.NEW
## 39948 Not Selected 1-16
## 39949 Not Selected 1-16
## 39951 Not Selected 1-16
## 39952 Not Selected 1-16
## 39953 Not Selected 1-16
## 39954 Not Selected 1-16
# Cross-tabulate STATEID against ELEC.HRS.3; these are the counts shown by the
# bar plot of STATEID faceted by ELEC.HRS.3 further down
table(allvariables.1nonas$STATEID,allvariables.1nonas$ELEC.HRS.3)
##
## No Access 1-16 17-24
## Jammu & Kashmir 01 0 529 154
## Himachal Pradesh 02 0 63 1405
## Punjab 03 0 161 1523
## Chandigarh 04 0 0 0
## Uttarakhand 05 0 234 209
## Haryana 06 0 1546 159
## Delhi 07 0 88 799
## Rajasthan 08 0 1253 1043
## Uttar Pradesh 09 1 2032 293
## Bihar 10 0 855 101
## Sikkim 11 0 0 0
## Arunachal Pradesh 12 0 0 0
## Nagaland 13 0 0 0
## Manipur 14 0 0 0
## Mizoram 15 0 0 0
## Tripura 16 0 0 0
## Meghalaya 17 0 0 0
## Assam 18 0 0 0
## West Bengal 19 0 375 1564
## Jharkhand 20 0 321 408
## Orissa 21 0 430 1068
## Chhattisgarh 22 0 186 1001
## Madhya Pradesh 23 20 1942 570
## Gujarat 24 0 43 1734
## Daman & Diu 25 0 0 0
## Dadra+Nagar Haveli 26 0 0 0
## Maharashtra 27 0 1334 1761
## Andhra Pradesh 28 0 1125 993
## Karnataka 29 0 1847 1699
## Goa 30 0 95 91
## Lakshadweep 31 0 0 0
## Kerala 32 0 50 1468
## Tamil Nadu 33 1 1423 445
## Pondicherry 34 0 0 0
## Anadman/Nicobar 35 0 0 0
library(ggplot2)
# Household counts per state, one stacked panel per ELEC.HRS.3 category
theme_set(theme_bw())
ggplot(data = allvariables.1nonas, mapping = aes(x = STATEID)) +
  geom_bar(mapping = aes(fill = ELEC.HRS.3)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  facet_wrap(~ELEC.HRS.3, ncol = 1, scales = "free_y") +
  scale_y_continuous(breaks = seq(from = 0, to = 2200, by = 200)) +
  guides(fill = FALSE)

# Household counts per ELEC.HRS.3 category, one panel per state.
# scales = "free_y" gives each facet its own y-axis ticks.
theme_set(theme_bw())
ggplot(allvariables.1nonas, aes(ELEC.HRS.3)) +
  geom_bar(aes(fill = ELEC.HRS.3)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  facet_wrap(~STATEID, scales = "free_y") +
  # FALSE is spelled out: the shorthand F is an ordinary, reassignable variable
  guides(fill = FALSE)

# Household counts per education level of the household head (EDU.HH),
# one stacked panel per ELEC.HRS.4 category
theme_set(theme_bw())
ggplot(data = allvariables.1nonas, mapping = aes(x = EDU.HH)) +
  geom_bar(mapping = aes(fill = ELEC.HRS.4)) +
  facet_wrap(~ELEC.HRS.4, ncol = 1, scales = "free_y") +
  theme(axis.text.x = element_text(angle = 90, face = "bold", size = 8)) +
  guides(fill = FALSE)

# Bar plot of EDU.HH with one panel per URBAN2011 x ELEC.HRS.4 combination,
# laid out four panels per row
theme_set(theme_bw())
ggplot(data = allvariables.1nonas, mapping = aes(x = EDU.HH)) +
  geom_bar(mapping = aes(fill = ELEC.HRS.4)) +
  facet_wrap(URBAN2011 ~ ELEC.HRS.4, ncol = 4, scales = "free_y") +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  guides(fill = FALSE)

#Bar chart with standard errors
library(tidyr)
library(dplyr)
library(ggplot2)
library(knitr)
library(scales)
library(RColorBrewer)
# Standard error of the mean: sd(x) / sqrt(n).
#
# x      numeric vector.
# na.rm  if TRUE, missing values are removed before both the standard
#        deviation and the sample size are computed; the default FALSE keeps
#        the original behaviour (any NA in x yields NA).
#
# Returns a single numeric value.
std_err <- function(x, na.rm = FALSE) {
  if (na.rm) {
    # Drop NAs up front so that n counts only the observed values
    x <- x[!is.na(x)]
  }
  sd(x) / sqrt(length(x))
}
library(dplyr)
# Per-state income summary: the median of INCOME and the standard error of
# INCOME as computed by std_err(); the result is then printed as a table
df_ind_sum <- summarise(
  group_by(allvariables.1, STATEID),
  Income_med = median(INCOME),
  Income_se = std_err(INCOME)
)
kable(df_ind_sum)
| Jammu & Kashmir 01 |
149475 |
9080.923 |
| Himachal Pradesh 02 |
104430 |
7858.078 |
| Punjab 03 |
116000 |
7131.363 |
| Uttarakhand 05 |
81970 |
6768.412 |
| Haryana 06 |
98475 |
10288.974 |
| Delhi 07 |
170000 |
10714.613 |
| Rajasthan 08 |
76200 |
3296.097 |
| Uttar Pradesh 09 |
55220 |
2120.620 |
| Bihar 10 |
49990 |
2721.459 |
| West Bengal 19 |
63000 |
5453.198 |
| Jharkhand 20 |
55500 |
4307.792 |
| Orissa 21 |
47625 |
2610.151 |
| Chhattisgarh 22 |
40386 |
3682.346 |
| Madhya Pradesh 23 |
47000 |
2955.330 |
| Gujarat 24 |
76800 |
5924.410 |
| Maharashtra 27 |
81800 |
2888.217 |
| Andhra Pradesh 28 |
62000 |
2271.968 |
| Karnataka 29 |
73075 |
3688.832 |
| Goa 30 |
103500 |
8841.920 |
| Kerala 32 |
134680 |
4764.551 |
| Tamil Nadu 33 |
89600 |
3338.673 |
table(allvariables.1$STATEID)
##
## Jammu & Kashmir 01 Himachal Pradesh 02 Punjab 03
## 720 1476 1702
## Chandigarh 04 Uttarakhand 05 Haryana 06
## 0 468 1806
## Delhi 07 Rajasthan 08 Uttar Pradesh 09
## 899 2707 3824
## Bihar 10 Sikkim 11 Arunachal Pradesh 12
## 1547 0 0
## Nagaland 13 Manipur 14 Mizoram 15
## 0 0 0
## Tripura 16 Meghalaya 17 Assam 18
## 0 0 0
## West Bengal 19 Jharkhand 20 Orissa 21
## 2435 853 2058
## Chhattisgarh 22 Madhya Pradesh 23 Gujarat 24
## 1324 3123 1895
## Daman & Diu 25 Dadra+Nagar Haveli 26 Maharashtra 27
## 0 0 3309
## Andhra Pradesh 28 Karnataka 29 Goa 30
## 2203 3865 188
## Lakshadweep 31 Kerala 32 Tamil Nadu 33
## 0 1570 1982
## Pondicherry 34 Anadman/Nicobar 35
## 0 0
# Bar chart of median income per state with error bars of +/- Income_se
ggplot(data = df_ind_sum, mapping = aes(x = STATEID, y = Income_med)) +
  geom_bar(stat = "identity") +
  labs(title = "Median Income Statewise", x = "States", y = "Median Income") +
  theme(axis.text = element_text(size = 8, angle = 90)) +
  geom_errorbar(
    mapping = aes(ymin = Income_med - Income_se, ymax = Income_med + Income_se),
    width = .2
  )

# v.2: same chart, with y-axis ticks every 25000 from 0 to 200000
ggplot(data = df_ind_sum, mapping = aes(x = STATEID, y = Income_med)) +
  geom_bar(stat = "identity") +
  labs(title = "Median Income Statewise", x = "States", y = "Median Income") +
  theme(axis.text = element_text(size = 8, angle = 90)) +
  scale_y_continuous(breaks = seq(from = 0, to = 200000, by = 25000)) +
  geom_errorbar(
    mapping = aes(ymin = Income_med - Income_se, ymax = Income_med + Income_se),
    width = .2
  )

#v 3
# ggplot_theme(): reusable ggplot2 theme built on theme_bw(base_size = 9).
# Takes no arguments and returns a theme object to be added to a plot with `+`.
# All colours except the background come from the 9-step RColorBrewer "Greys"
# palette; the legend is hidden by default.
ggplot_theme <- function() {
  # Grey ramp supplying the grid, text and title colours
  greys <- brewer.pal(n = 9, name = "Greys")
  bg_col <- "white" # alternative left commented out in the original: palette[2]
  grid_col <- greys[3]
  tick_label_col <- greys[6]
  axis_title_col <- greys[7]
  title_col <- greys[9]

  theme_bw(base_size = 9) +
    theme(
      # Panel, plot background and panel border all use the background colour
      panel.background = element_rect(fill = bg_col, color = bg_col),
      plot.background = element_rect(fill = bg_col, color = bg_col),
      panel.border = element_rect(color = bg_col),
      # Grid: thin major lines only, no minor lines, no axis ticks
      panel.grid.major = element_line(color = grid_col, size = .25),
      panel.grid.minor = element_blank(),
      axis.ticks = element_blank(),
      # Legend is formatted but hidden by default
      legend.position = "none",
      legend.background = element_rect(fill = bg_col),
      legend.text = element_text(size = 7, color = axis_title_col),
      # Title, tick labels and axis titles
      plot.title = element_text(color = title_col, size = 12, vjust = 1.25),
      axis.text.x = element_text(size = 7, color = tick_label_col),
      axis.text.y = element_text(size = 7, color = tick_label_col),
      axis.title.x = element_text(size = 10, color = axis_title_col, vjust = 0),
      axis.title.y = element_text(size = 10, color = axis_title_col, vjust = 1.25),
      # Plot margins, in centimetres
      plot.margin = unit(c(0.35, 0.2, 0.3, 0.35), "cm")
    )
}
# Horizontal bar chart of median income per state, styled with ggplot_theme()
# NOTE(review): ggplot_theme() is built on theme_bw() and is added last, so it
# presumably overrides the axis.text setting of the theme() call below — confirm.
ggplot(data = df_ind_sum, mapping = aes(x = STATEID, y = Income_med)) +
  geom_bar(stat = "identity", fill = "#c0392b", alpha = 0.75) +
  labs(title = "Median Income Statewise", x = "States", y = "Median Income") +
  theme(axis.text = element_text(size = 8, angle = 90)) +
  geom_errorbar(
    mapping = aes(ymin = Income_med - Income_se, ymax = Income_med + Income_se),
    width = .2
  ) +
  coord_flip() +
  ggplot_theme()

# Reverse the order of the states along the (flipped) x axis.
# The scale limits are reversed, not the labels: reversing only `labels` would
# leave every bar where it was and attach another state's name to it.
ggplot(df_ind_sum, aes(x = STATEID, y = Income_med)) +
  geom_bar(stat = "identity", fill = "#c0392b", alpha = 0.75) +
  labs(title = "Median Income Statewise", x = "States", y = "Median Income") +
  theme(axis.text = element_text(size = 8, angle = 90)) +
  # as.character() so the limits are state names rather than factor codes
  scale_x_discrete(limits = rev(as.character(df_ind_sum$STATEID))) +
  geom_errorbar(
    aes(ymin = Income_med - Income_se, ymax = Income_med + Income_se),
    width = .2
  ) +
  coord_flip() +
  ggplot_theme()

###################
# Same median-income bars, coloured by state and drawn in polar coordinates
ggplot(
  data = df_ind_sum,
  mapping = aes(x = STATEID, y = Income_med, fill = STATEID)
) +
  geom_bar(stat = "identity") +
  labs(title = "Median Income Statewise", x = "States", y = "Median Income") +
  theme(axis.text = element_text(size = 8, angle = 90)) +
  geom_errorbar(
    mapping = aes(ymin = Income_med - Income_se, ymax = Income_med + Income_se),
    width = .2
  ) +
  coord_polar()

#Line chart using mean of Income
# Mean income and total assets for every STATEID x ELEC.HRS.3 combination.
# na.rm = TRUE in sum(): ASSETS contains missing values, and without it a single
# NA turns the total of its whole group into NA.
# NOTE: the rendered table below was produced before na.rm = TRUE was added.
df_ind_mean <- allvariables.1 %>%
  group_by(STATEID, ELEC.HRS.3) %>%
  summarise(
    Income_mean = mean(INCOME),
    TotalAssets = sum(ASSETS, na.rm = TRUE)
  )
kable(df_ind_mean)
| Jammu & Kashmir 01 |
1-16 |
213974.15 |
9986 |
| Jammu & Kashmir 01 |
17-24 |
260984.08 |
3470 |
| Jammu & Kashmir 01 |
NA |
345185.00 |
31 |
| Himachal Pradesh 02 |
1-16 |
129297.44 |
1061 |
| Himachal Pradesh 02 |
17-24 |
177701.44 |
25650 |
| Himachal Pradesh 02 |
NA |
76081.25 |
80 |
| Punjab 03 |
1-16 |
160272.65 |
3173 |
| Punjab 03 |
17-24 |
202844.17 |
32231 |
| Punjab 03 |
NA |
70748.76 |
147 |
| Uttarakhand 05 |
1-16 |
112795.15 |
3830 |
| Uttarakhand 05 |
17-24 |
164542.92 |
3913 |
| Uttarakhand 05 |
NA |
38540.83 |
176 |
| Haryana 06 |
1-16 |
169019.06 |
NA |
| Haryana 06 |
17-24 |
305111.98 |
NA |
| Haryana 06 |
NA |
122365.95 |
859 |
| Delhi 07 |
1-16 |
172350.00 |
1920 |
| Delhi 07 |
17-24 |
265812.55 |
18113 |
| Delhi 07 |
NA |
230660.00 |
58 |
| Rajasthan 08 |
1-16 |
112094.43 |
17438 |
| Rajasthan 08 |
17-24 |
164266.39 |
NA |
| Rajasthan 08 |
NA |
70739.98 |
2951 |
| Uttar Pradesh 09 |
No Access |
132460.00 |
12 |
| Uttar Pradesh 09 |
1-16 |
114182.21 |
31337 |
| Uttar Pradesh 09 |
17-24 |
147426.82 |
5537 |
| Uttar Pradesh 09 |
NA |
50347.38 |
10809 |
| Bihar 10 |
1-16 |
97682.27 |
10970 |
| Bihar 10 |
17-24 |
139167.57 |
1853 |
| Bihar 10 |
NA |
44209.16 |
NA |
| West Bengal 19 |
1-16 |
130822.35 |
5341 |
| West Bengal 19 |
17-24 |
131854.48 |
24971 |
| West Bengal 19 |
NA |
54718.13 |
3343 |
| Jharkhand 20 |
1-16 |
91288.91 |
4605 |
| Jharkhand 20 |
17-24 |
122659.47 |
6487 |
| Jharkhand 20 |
NA |
41349.73 |
681 |
| Orissa 21 |
1-16 |
69659.57 |
5409 |
| Orissa 21 |
17-24 |
110416.24 |
16198 |
| Orissa 21 |
NA |
42482.44 |
2840 |
| Chhattisgarh 22 |
1-16 |
62157.20 |
1935 |
| Chhattisgarh 22 |
17-24 |
96740.57 |
14100 |
| Chhattisgarh 22 |
NA |
28314.21 |
649 |
| Madhya Pradesh 23 |
No Access |
22116.30 |
104 |
| Madhya Pradesh 23 |
1-16 |
80403.43 |
NA |
| Madhya Pradesh 23 |
17-24 |
132155.99 |
10287 |
| Madhya Pradesh 23 |
NA |
38475.82 |
3287 |
| Gujarat 24 |
1-16 |
50916.48 |
509 |
| Gujarat 24 |
17-24 |
143961.75 |
NA |
| Gujarat 24 |
NA |
67489.78 |
577 |
| Maharashtra 27 |
1-16 |
125510.31 |
20917 |
| Maharashtra 27 |
17-24 |
147213.11 |
32273 |
| Maharashtra 27 |
NA |
59661.96 |
1390 |
| Andhra Pradesh 28 |
1-16 |
77522.75 |
NA |
| Andhra Pradesh 28 |
17-24 |
98467.15 |
NA |
| Andhra Pradesh 28 |
NA |
41195.14 |
NA |
| Karnataka 29 |
1-16 |
105682.90 |
NA |
| Karnataka 29 |
17-24 |
148221.97 |
NA |
| Karnataka 29 |
NA |
71562.49 |
NA |
| Goa 30 |
1-16 |
133228.14 |
2106 |
| Goa 30 |
17-24 |
139525.05 |
1923 |
| Goa 30 |
NA |
50000.00 |
23 |
| Kerala 32 |
1-16 |
141827.86 |
984 |
| Kerala 32 |
17-24 |
174356.54 |
NA |
| Kerala 32 |
NA |
212943.95 |
607 |
| Tamil Nadu 33 |
No Access |
59500.00 |
16 |
| Tamil Nadu 33 |
1-16 |
124014.88 |
26321 |
| Tamil Nadu 33 |
17-24 |
132274.07 |
8761 |
| Tamil Nadu 33 |
NA |
71132.70 |
489 |
## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame': 66 obs. of 4 variables:
## $ STATEID : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 2 2 2 3 3 3 5 ...
## $ ELEC.HRS.3 : Factor w/ 3 levels "No Access","1-16",..: 2 3 NA 2 3 NA 2 3 NA 2 ...
## $ Income_mean: num 213974 260984 345185 129297 177701 ...
## $ TotalAssets: num 9986 3470 31 1061 25650 ...
## - attr(*, "vars")=List of 1
## ..$ : symbol STATEID
## - attr(*, "drop")= logi TRUE
# Number of NAs in each column of the summary table
colSums(is.na(df_ind_mean))
## STATEID ELEC.HRS.3 Income_mean TotalAssets
## 0 21 0 13
# Locate the summary rows that have an NA in any column
df <- which(rowSums(is.na(df_ind_mean)) > 0)
df # indices of the rows that contain NAs
## [1] 3 6 9 12 13 14 15 18 20 21 25 28 31 34 37 40 42 44 46 47 50 51 52
## [24] 53 54 55 56 59 61 62 66
length(df) #no of rows to be deleted
## [1] 31
# Remove the rows flagged in `df`. The guard matters when `df` is empty:
# x[-integer(0), ] selects zero rows and would discard the whole table.
if (length(df) > 0) {
  df_ind_mean <- df_ind_mean[-df, ] # removing NAs
}
str(df_ind_mean)
## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame': 35 obs. of 4 variables:
## $ STATEID : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 2 2 3 3 5 5 7 7 ...
## $ ELEC.HRS.3 : Factor w/ 3 levels "No Access","1-16",..: 2 3 2 3 2 3 2 3 2 3 ...
## $ Income_mean: num 213974 260984 129297 177701 160273 ...
## $ TotalAssets: num 9986 3470 1061 25650 3173 ...
## - attr(*, "vars")=List of 1
## ..$ : symbol STATEID
## - attr(*, "drop")= logi TRUE
## - attr(*, "indices")=List of 18
## ..$ : int 0 1
## ..$ : int 2 3
## ..$ : int 4 5
## ..$ : int 6 7
## ..$ : int 8 9
## ..$ : int 10
## ..$ : int 11 12 13
## ..$ : int 14 15
## ..$ : int 16 17
## ..$ : int 18 19
## ..$ : int 20 21
## ..$ : int 22 23
## ..$ : int 24 25
## ..$ : int 26
## ..$ : int 27 28
## ..$ : int 29 30
## ..$ : int 31
## ..$ : int 32 33 34
## - attr(*, "group_sizes")= int 2 2 2 2 2 1 3 2 2 2 ...
## - attr(*, "biggest_group_size")= int 3
## - attr(*, "labels")='data.frame': 18 obs. of 1 variable:
## ..$ STATEID: Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 2 3 5 7 8 9 10 19 20 ...
## ..- attr(*, "vars")=List of 1
## .. ..$ : symbol STATEID
## ..- attr(*, "drop")= logi TRUE
# Confirm that no NAs are left in any column
# (cell-by-cell view, disabled: is.na(df_ind_mean))
colSums(is.na(df_ind_mean))
## STATEID ELEC.HRS.3 Income_mean TotalAssets
## 0 0 0 0
# Mean income per ELEC.HRS.3 category, one panel per state
theme_set(theme_bw())
ggplot(data = df_ind_mean, mapping = aes(x = ELEC.HRS.3, y = Income_mean)) +
  geom_bar(stat = "identity") +
  facet_wrap(~STATEID) +
  scale_y_continuous(labels = comma)

#####################
# Mean income across ELEC.HRS.3 categories, one coloured line per state
ggplot(
  data = df_ind_mean,
  mapping = aes(x = ELEC.HRS.3, y = Income_mean, color = STATEID, group = STATEID)
) +
  geom_line() +
  scale_y_continuous(labels = comma)

# The plot above shows that only very few states have households in the
# "No Access" category. Let's recheck the category counts.
table(allvariables.1$ELEC.HRS.3)
##
## No Access 1-16 17-24
## 22 16091 18644
# Mean income against total assets, one coloured line per ELEC.HRS.3 category
ggplot(data = df_ind_mean, mapping = aes(x = TotalAssets)) +
  geom_line(mapping = aes(y = Income_mean, color = ELEC.HRS.3))

# Scatter of mean income against total assets, one panel per ELEC.HRS.3 category
ggplot(data = df_ind_mean, mapping = aes(x = TotalAssets, y = Income_mean)) +
  geom_point(size = 2, color = "steelblue", stroke = 1) +
  facet_wrap(~ELEC.HRS.3) +
  scale_y_continuous(labels = comma)

############################################
#Line chart using median Income
# Median income and total assets for every STATEID x ELEC.HRS.3 combination.
# na.rm = TRUE in sum(): ASSETS contains missing values, and without it a single
# NA turns the total of its whole group into NA.
# NOTE: the rendered table below was produced before na.rm = TRUE was added.
df_ind_median <- allvariables.1 %>%
  group_by(STATEID, ELEC.HRS.3) %>%
  summarise(
    Income_median = median(INCOME),
    TotalAssets = sum(ASSETS, na.rm = TRUE)
  )
kable(df_ind_median)
| Jammu & Kashmir 01 |
1-16 |
141000.0 |
9986 |
| Jammu & Kashmir 01 |
17-24 |
174601.0 |
3470 |
| Jammu & Kashmir 01 |
NA |
345185.0 |
31 |
| Himachal Pradesh 02 |
1-16 |
86550.0 |
1061 |
| Himachal Pradesh 02 |
17-24 |
105719.0 |
25650 |
| Himachal Pradesh 02 |
NA |
70820.0 |
80 |
| Punjab 03 |
1-16 |
97160.0 |
3173 |
| Punjab 03 |
17-24 |
120000.0 |
32231 |
| Punjab 03 |
NA |
64100.0 |
147 |
| Uttarakhand 05 |
1-16 |
72000.0 |
3830 |
| Uttarakhand 05 |
17-24 |
108830.0 |
3913 |
| Uttarakhand 05 |
NA |
32490.0 |
176 |
| Haryana 06 |
1-16 |
96412.5 |
NA |
| Haryana 06 |
17-24 |
150400.0 |
NA |
| Haryana 06 |
NA |
62800.0 |
859 |
| Delhi 07 |
1-16 |
105000.0 |
1920 |
| Delhi 07 |
17-24 |
180000.0 |
18113 |
| Delhi 07 |
NA |
84700.0 |
58 |
| Rajasthan 08 |
1-16 |
68982.5 |
17438 |
| Rajasthan 08 |
17-24 |
100000.0 |
NA |
| Rajasthan 08 |
NA |
50695.0 |
2951 |
| Uttar Pradesh 09 |
No Access |
132460.0 |
12 |
| Uttar Pradesh 09 |
1-16 |
70200.0 |
31337 |
| Uttar Pradesh 09 |
17-24 |
87850.0 |
5537 |
| Uttar Pradesh 09 |
NA |
38250.0 |
10809 |
| Bihar 10 |
1-16 |
61500.0 |
10970 |
| Bihar 10 |
17-24 |
87775.0 |
1853 |
| Bihar 10 |
NA |
36420.0 |
NA |
| West Bengal 19 |
1-16 |
58740.0 |
5341 |
| West Bengal 19 |
17-24 |
78925.0 |
24971 |
| West Bengal 19 |
NA |
39850.0 |
3343 |
| Jharkhand 20 |
1-16 |
51500.0 |
4605 |
| Jharkhand 20 |
17-24 |
71330.0 |
6487 |
| Jharkhand 20 |
NA |
33892.5 |
681 |
| Orissa 21 |
1-16 |
47600.0 |
5409 |
| Orissa 21 |
17-24 |
63000.0 |
16198 |
| Orissa 21 |
NA |
32242.5 |
2840 |
| Chhattisgarh 22 |
1-16 |
30002.5 |
1935 |
| Chhattisgarh 22 |
17-24 |
47730.0 |
14100 |
| Chhattisgarh 22 |
NA |
24250.0 |
649 |
| Madhya Pradesh 23 |
No Access |
18965.0 |
104 |
| Madhya Pradesh 23 |
1-16 |
45707.5 |
NA |
| Madhya Pradesh 23 |
17-24 |
88800.0 |
10287 |
| Madhya Pradesh 23 |
NA |
29395.0 |
3287 |
| Gujarat 24 |
1-16 |
37090.0 |
509 |
| Gujarat 24 |
17-24 |
80500.0 |
NA |
| Gujarat 24 |
NA |
46600.0 |
577 |
| Maharashtra 27 |
1-16 |
77007.5 |
20917 |
| Maharashtra 27 |
17-24 |
96000.0 |
32273 |
| Maharashtra 27 |
NA |
53195.0 |
1390 |
| Andhra Pradesh 28 |
1-16 |
58150.0 |
NA |
| Andhra Pradesh 28 |
17-24 |
72330.0 |
NA |
| Andhra Pradesh 28 |
NA |
34625.0 |
NA |
| Karnataka 29 |
1-16 |
69520.0 |
NA |
| Karnataka 29 |
17-24 |
85190.0 |
NA |
| Karnataka 29 |
NA |
53400.0 |
NA |
| Goa 30 |
1-16 |
109500.0 |
2106 |
| Goa 30 |
17-24 |
99000.0 |
1923 |
| Goa 30 |
NA |
50000.0 |
23 |
| Kerala 32 |
1-16 |
118015.0 |
984 |
| Kerala 32 |
17-24 |
135600.0 |
NA |
| Kerala 32 |
NA |
112000.0 |
607 |
| Tamil Nadu 33 |
No Access |
59500.0 |
16 |
| Tamil Nadu 33 |
1-16 |
90000.0 |
26321 |
| Tamil Nadu 33 |
17-24 |
91000.0 |
8761 |
| Tamil Nadu 33 |
NA |
52435.0 |
489 |
## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame': 66 obs. of 4 variables:
## $ STATEID : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 2 2 2 3 3 3 5 ...
## $ ELEC.HRS.3 : Factor w/ 3 levels "No Access","1-16",..: 2 3 NA 2 3 NA 2 3 NA 2 ...
## $ Income_median: num 141000 174601 345185 86550 105719 ...
## $ TotalAssets : num 9986 3470 31 1061 25650 ...
## - attr(*, "vars")=List of 1
## ..$ : symbol STATEID
## - attr(*, "drop")= logi TRUE
# Count the missing values in each column of the summary table.
# vapply() walks the columns directly; apply() would first coerce the
# whole data frame to a (character) matrix.
vapply(df_ind_median, function(x) sum(is.na(x)), integer(1))
## STATEID ELEC.HRS.3 Income_median TotalAssets
## 0 21 0 13
# Remove every row that has at least one NA.
df <- which(rowSums(is.na(df_ind_median)) > 0)
df # row nos that have NAs
## [1] 3 6 9 12 13 14 15 18 20 21 25 28 31 34 37 40 42 44 46 47 50 51 52
## [24] 53 54 55 56 59 61 62 66
length(df) #no of rows to be deleted
## [1] 31
# Guard the negative index: x[-integer(0), ] selects zero rows, so without
# the check a table with no NAs would be emptied instead of left unchanged.
if (length(df) > 0) {
  df_ind_median <- df_ind_median[-df, ] #removing NAs
}
str(df_ind_median)
## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame': 35 obs. of 4 variables:
## $ STATEID : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 2 2 3 3 5 5 7 7 ...
## $ ELEC.HRS.3 : Factor w/ 3 levels "No Access","1-16",..: 2 3 2 3 2 3 2 3 2 3 ...
## $ Income_median: num 141000 174601 86550 105719 97160 ...
## $ TotalAssets : num 9986 3470 1061 25650 3173 ...
## - attr(*, "vars")=List of 1
## ..$ : symbol STATEID
## - attr(*, "drop")= logi TRUE
## - attr(*, "indices")=List of 18
## ..$ : int 0 1
## ..$ : int 2 3
## ..$ : int 4 5
## ..$ : int 6 7
## ..$ : int 8 9
## ..$ : int 10
## ..$ : int 11 12 13
## ..$ : int 14 15
## ..$ : int 16 17
## ..$ : int 18 19
## ..$ : int 20 21
## ..$ : int 22 23
## ..$ : int 24 25
## ..$ : int 26
## ..$ : int 27 28
## ..$ : int 29 30
## ..$ : int 31
## ..$ : int 32 33 34
## - attr(*, "group_sizes")= int 2 2 2 2 2 1 3 2 2 2 ...
## - attr(*, "biggest_group_size")= int 3
## - attr(*, "labels")='data.frame': 18 obs. of 1 variable:
## ..$ STATEID: Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 2 3 5 7 8 9 10 19 20 ...
## ..- attr(*, "vars")=List of 1
## .. ..$ : symbol STATEID
## ..- attr(*, "drop")= logi TRUE
# Check again for NAs: every per-column count should now be 0.
#is.na(df_ind_mean)
# vapply() iterates over the columns without coercing the data frame to a
# matrix, which apply(, 2, ) would do.
vapply(df_ind_median, function(x) sum(is.na(x)), integer(1))
## STATEID ELEC.HRS.3 Income_median TotalAssets
## 0 0 0 0
# Switch the default ggplot2 theme to black-and-white for later figures.
theme_set(theme_bw())

# Bars of Income_median per ELEC.HRS.3 level, one panel per state.
ggplot(
  data = df_ind_median,
  mapping = aes(x = ELEC.HRS.3, y = Income_median)
) +
  geom_bar(stat = "identity") +
  facet_wrap(~STATEID) +
  scale_y_continuous(labels = comma)

# One line per state across the ELEC.HRS.3 levels.
ggplot(
  data = df_ind_median,
  mapping = aes(
    x = ELEC.HRS.3, y = Income_median,
    color = STATEID, group = STATEID
  )
) +
  geom_line() +
  scale_y_continuous(labels = comma)

# Income_median against TotalAssets, one coloured line per ELEC.HRS.3 level.
ggplot(data = df_ind_median, mapping = aes(x = TotalAssets)) +
  geom_line(mapping = aes(y = Income_median, color = ELEC.HRS.3))

# Scatter of Income_median against TotalAssets, split by ELEC.HRS.3.
ggplot(
  data = df_ind_median,
  mapping = aes(x = TotalAssets, y = Income_median)
) +
  geom_point(color = "steelblue", size = 2, stroke = 1) +
  facet_wrap(~ELEC.HRS.3) +
  scale_y_continuous(labels = comma)

# Boxplots
# Keep only the columns needed for the boxplots.
allvariables.1new <- select(
  allvariables.1,
  c(STATEID, INCOME, NFE, ASSETS, ELEC.HRS.3, ELEC.HRS.4)
)
# Preview the first rows of the subset.
head(allvariables.1new)
## STATEID INCOME NFE ASSETS ELEC.HRS.3 ELEC.HRS.4
## 1 Jammu & Kashmir 01 176100 No 0 20 1-16 12-18
## 2 Jammu & Kashmir 01 1039150 No 0 24 1-16 6-12
## 3 Jammu & Kashmir 01 182340 No 0 22 1-16 6-12
## 4 Jammu & Kashmir 01 90760 No 0 16 1-16 6-12
## 5 Jammu & Kashmir 01 212600 No 0 17 1-16 6-12
## 6 Jammu & Kashmir 01 152100 No 0 12 1-16 12-18
# Count the missing values in each column of the boxplot subset.
# vapply() works column by column; apply() would coerce the data frame to a
# matrix first.
vapply(allvariables.1new, function(x) sum(is.na(x)), integer(1))
## STATEID INCOME NFE ASSETS ELEC.HRS.3 ELEC.HRS.4
## 0 0 0 18 5197 5197
# Row numbers that contain at least one NA.
df1 <- which(rowSums(is.na(allvariables.1new)) > 0)
#df1 # row nos that have NAs
length(df1) #no of rows to be deleted
## [1] 5209
# Only apply the negative index when there is something to drop:
# x[-integer(0), ] would return zero rows rather than the full table.
df_new <- if (length(df1) > 0) {
  allvariables.1new[-df1, ] #removing NAs
} else {
  allvariables.1new
}
str(df_new)
## 'data.frame': 34745 obs. of 6 variables:
## $ STATEID : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ INCOME : num 176100 1039150 182340 90760 212600 ...
## $ NFE : Factor w/ 2 levels "No 0","Yes 1": 1 1 1 1 1 1 1 1 1 1 ...
## $ ASSETS : num 20 24 22 16 17 12 13 7 11 10 ...
## $ ELEC.HRS.3: Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
## $ ELEC.HRS.4: Factor w/ 4 levels "0-6","6-12","12-18",..: 3 2 2 2 2 3 1 4 4 2 ...
# Re-count NAs per column after the row removal (all counts should be 0).
# vapply() avoids the data-frame-to-matrix coercion done by apply(, 2, ).
vapply(df_new, function(x) sum(is.na(x)), integer(1))
## STATEID INCOME NFE ASSETS ELEC.HRS.3 ELEC.HRS.4
## 0 0 0 0 0 0
# log(INCOME) by NFE, boxes filled by the four-level electricity-hours band.
ggplot(
  data = df_new,
  mapping = aes(x = NFE, y = log(INCOME), fill = ELEC.HRS.4)
) +
  geom_boxplot()

# Same plot, filled by the three-level electricity-hours band.
ggplot(
  data = df_new,
  mapping = aes(x = NFE, y = log(INCOME), fill = ELEC.HRS.3)
) +
  geom_boxplot()

# ASSETS by ELEC.HRS.3: notched box plot next to an untrimmed violin plot.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
p1 <- ggplot(df_new, aes(ELEC.HRS.3, ASSETS)) +
  geom_boxplot(
    aes(fill = ELEC.HRS.3),
    width = 0.5,
    outlier.colour = "dodgerblue",
    outlier.size = 4,
    outlier.shape = 16,
    outlier.stroke = 2,
    notch = TRUE
  ) +
  labs(title = "Box plot") # boxplot
p2 <- ggplot(df_new, aes(ELEC.HRS.3, ASSETS)) +
  geom_violin(aes(fill = ELEC.HRS.3), width = 0.5, trim = FALSE) +
  labs(title = "Violin plot (untrimmed)") # violin plot
gridExtra::grid.arrange(p1, p2, ncol = 2)

# ASSETS by NFE: the same pair of plots.
p3 <- ggplot(df_new, aes(NFE, ASSETS)) +
  geom_boxplot(
    aes(fill = NFE),
    width = 0.5,
    outlier.colour = "dodgerblue",
    outlier.size = 4,
    outlier.shape = 16,
    outlier.stroke = 2,
    notch = TRUE
  ) +
  labs(title = "Box plot") # boxplot
p4 <- ggplot(df_new, aes(NFE, ASSETS)) +
  geom_violin(aes(fill = NFE), width = 0.5, trim = FALSE) +
  labs(title = "Violin plot (untrimmed)") # violin plot
gridExtra::grid.arrange(p3, p4, ncol = 2)

library(extracat)
# rmb plot of the three categorical variables.
rmb(
  formula = ~ NFE + ELEC.HRS.3 + ELEC.HRS.4,
  data = df_new
)

# Base-graphics bar chart of the NFE x log(INCOME) contingency table.
barplot(
  table(df_new$NFE, log(df_new$INCOME)),
  main = "Log(Income) by NFE",
  xlab = "Log(Income)",
  legend.text = TRUE
)

# Base-graphics bar chart of the NFE x ASSETS contingency table.
barplot(
  table(df_new$NFE, df_new$ASSETS),
  main = "Assets by NFE",
  xlab = "Assets",
  legend.text = TRUE
)

# Stacked bar chart of ASSETS counts, with bars filled by NFE.
ggplot(df_new) +
  geom_bar(aes(x = ASSETS, fill = NFE)) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    axis.line = element_line(color = "black")
  ) +
  ggtitle("Assets by NFE") +
  # NFE is mapped to `fill`, so the manual palette has to be a fill scale;
  # a colour scale has no aesthetic to act on in this plot and the two
  # custom colours would never be shown.
  scale_fill_manual(name = "NFE", values = c("#11c2d7", "#9f0303"))

# ASSETS counts split by the three-level electricity-hours band.
barplot(
  table(df_new$ELEC.HRS.3, df_new$ASSETS),
  main = "Assets by ELEC.HRS.3",
  xlab = "Assets",
  legend.text = TRUE
)

# ASSETS counts split by the four-level electricity-hours band.
barplot(
  table(df_new$ELEC.HRS.4, df_new$ASSETS),
  main = "Assets by ELEC.HRS.4",
  xlab = "Assets",
  legend.text = TRUE
)

# ggplot2 version of the ELEC.HRS.4 chart.
ggplot(df_new) +
  geom_bar(mapping = aes(x = ASSETS, fill = ELEC.HRS.4)) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    axis.line = element_line(color = "black")
  ) +
  ggtitle("Assets by ELEC.HRS.4")

# NFE counts within each ELEC.HRS.4 band.
barplot(
  table(df_new$NFE, df_new$ELEC.HRS.4),
  main = "ELEC.HRS.4 by NFE",
  xlab = "ELEC.HRS.4 ",
  legend.text = TRUE
)


# Mosaic plot of the same NFE x ELEC.HRS.4 table.
mosaicplot(
  table(df_new$NFE, df_new$ELEC.HRS.4),
  xlab = "NFE",
  ylab = "ELEC.HRS.4"
)

# log(INCOME) against ASSETS: one box per ASSETS value, points coloured by
# ELEC.HRS.3, and a linear fit per ELEC.HRS.3 level.
ggplot(data = df_new, mapping = aes(x = ASSETS, y = log(INCOME))) +
  geom_boxplot(mapping = aes(group = ASSETS)) +
  geom_point(mapping = aes(color = ELEC.HRS.3)) +
  geom_smooth(mapping = aes(color = ELEC.HRS.3), method = "lm") +
  labs(title = "Assets vs. log(INCOME) grouped by ELEC.HRS.3 ") +
  theme(plot.title = element_text(size = 20))

# ROOMS against ASSETS, one coloured line per ELEC.HRS.3 level.
ggplot(
  data = allvariables.1nonas,
  mapping = aes(x = ASSETS, y = ROOMS, color = ELEC.HRS.3)
) +
  geom_line() +
  ggtitle("Assets vs. Rooms grouped by ELEC.HRS.3")

# ROOMS against RICE.P, one coloured line per ELEC.HRS.3 level.
ggplot(
  data = allvariables.1nonas,
  mapping = aes(x = RICE.P, y = ROOMS, color = ELEC.HRS.3)
) +
  geom_line() +
  ggtitle("RICE.P vs. ROOMS grouped by ELEC.HRS.3")

# ROOMS against RICE.P, one coloured line per NFE level.
ggplot(
  data = allvariables.1nonas,
  mapping = aes(x = RICE.P, y = ROOMS, color = NFE)
) +
  geom_line() +
  ggtitle("RICE.P vs. ROOMS grouped by NFE")

#############################
library(beanplot)
library(RColorBrewer)
# One colour vector per bean: a Set3 colour first, then "black" for the
# remaining three entries.
bean.cols <- lapply(
  brewer.pal(6, "Set3"),
  function(fill) c(fill, rep("black", 3))
)
# Bean plot of ASSETS for each ELEC.HRS.4 level.
beanplot(
  ASSETS ~ ELEC.HRS.4,
  data = df_new,
  col = bean.cols,
  lwd = 1,
  what = c(1, 1, 1, 1),
  main = "Relationship between ELEC.HRS.4 and ASSETS",
  xlab = "ELEC.HRS.4",
  ylab = "ASSETS"
)

# In one panel
# Keep only the columns needed for the combined boxplot panel.
allvariables.1box <- select(
  allvariables.1,
  c(STATEID, INCOME, ELEC.ACCESS, NFE, URBAN2011, ELEC.HRS.3, ELEC.HRS.4)
)
# Preview the first rows of the subset.
head(allvariables.1box)
## STATEID INCOME ELEC.ACCESS NFE URBAN2011 ELEC.HRS.3
## 1 Jammu & Kashmir 01 176100 Yes 1 No 0 rural 0 1-16
## 2 Jammu & Kashmir 01 1039150 Yes 1 No 0 rural 0 1-16
## 3 Jammu & Kashmir 01 182340 Yes 1 No 0 rural 0 1-16
## 4 Jammu & Kashmir 01 90760 Yes 1 No 0 rural 0 1-16
## 5 Jammu & Kashmir 01 212600 Yes 1 No 0 rural 0 1-16
## 6 Jammu & Kashmir 01 152100 Yes 1 No 0 rural 0 1-16
## ELEC.HRS.4
## 1 12-18
## 2 6-12
## 3 6-12
## 4 6-12
## 5 6-12
## 6 12-18
# Count the missing values in each column of the panel subset.
# vapply() iterates over columns without coercing the data frame to a matrix.
vapply(allvariables.1box, function(x) sum(is.na(x)), integer(1))
## STATEID INCOME ELEC.ACCESS NFE URBAN2011 ELEC.HRS.3
## 0 0 144 0 0 5197
## ELEC.HRS.4
## 5197
# Only columns 1-4 (STATEID, INCOME, ELEC.ACCESS, NFE) are checked here, so
# NAs in the remaining columns are kept in df_g1.
df2 <- which(rowSums(is.na(allvariables.1box[, c(1, 2, 3, 4)])) > 0)
#df2 # row nos that have NAs
length(df2) #no of rows to be deleted
## [1] 144
# Guard the negative index: x[-integer(0), ] would return zero rows.
df_g1 <- if (length(df2) > 0) {
  allvariables.1box[-df2, ] #removing NAs
} else {
  allvariables.1box
}
str(df_g1)
## 'data.frame': 39810 obs. of 7 variables:
## $ STATEID : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ INCOME : num 176100 1039150 182340 90760 212600 ...
## $ ELEC.ACCESS: Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 2 2 2 ...
## $ NFE : Factor w/ 2 levels "No 0","Yes 1": 1 1 1 1 1 1 1 1 1 1 ...
## $ URBAN2011 : Factor w/ 2 levels "rural 0","urban 1": 1 1 1 1 1 1 1 1 1 1 ...
## $ ELEC.HRS.3 : Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
## $ ELEC.HRS.4 : Factor w/ 4 levels "0-6","6-12","12-18",..: 3 2 2 2 2 3 1 4 4 2 ...
# Re-count NAs per column of df_g1.
# vapply() avoids the data-frame-to-matrix coercion done by apply(, 2, ).
vapply(df_g1, function(x) sum(is.na(x)), integer(1))
## STATEID INCOME ELEC.ACCESS NFE URBAN2011 ELEC.HRS.3
## 0 0 0 0 0 5055
## ELEC.HRS.4
## 5055
# Jittered points with a transparent box plot on top: log(INCOME) against
# ELEC.ACCESS, NFE and URBAN2011. The plots are stored, not printed.
g1 <- ggplot(
  data = df_g1,
  mapping = aes(x = ELEC.ACCESS, y = log(INCOME))
) +
  geom_jitter(color = "tomato", alpha = 0.3) +
  geom_boxplot(alpha = 0)
g2 <- ggplot(
  data = df_g1,
  mapping = aes(x = NFE, y = log(INCOME))
) +
  geom_jitter(color = "tomato", alpha = 0.3) +
  geom_boxplot(alpha = 0)
g3 <- ggplot(
  data = df_g1,
  mapping = aes(x = URBAN2011, y = log(INCOME))
) +
  geom_jitter(color = "tomato", alpha = 0.3) +
  geom_boxplot(alpha = 0)
# Subset the relevant columns for the second set of panel plots.
allvariables.1box2 <- allvariables.1 %>%
  select(c(STATEID, INCOME, ELEC.ACCESS, NFE, URBAN2011, ELEC.HRS.3, ELEC.HRS.4))
# Use the subset created just above throughout this section; the earlier
# code built allvariables.1box2 but then kept reading allvariables.1box.
head(allvariables.1box2)
## STATEID INCOME ELEC.ACCESS NFE URBAN2011 ELEC.HRS.3
## 1 Jammu & Kashmir 01 176100 Yes 1 No 0 rural 0 1-16
## 2 Jammu & Kashmir 01 1039150 Yes 1 No 0 rural 0 1-16
## 3 Jammu & Kashmir 01 182340 Yes 1 No 0 rural 0 1-16
## 4 Jammu & Kashmir 01 90760 Yes 1 No 0 rural 0 1-16
## 5 Jammu & Kashmir 01 212600 Yes 1 No 0 rural 0 1-16
## 6 Jammu & Kashmir 01 152100 Yes 1 No 0 rural 0 1-16
## ELEC.HRS.4
## 1 12-18
## 2 6-12
## 3 6-12
## 4 6-12
## 5 6-12
## 6 12-18
# Count the missing values in each column (vapply() avoids coercing the
# data frame to a matrix).
vapply(allvariables.1box2, function(x) sum(is.na(x)), integer(1))
## STATEID INCOME ELEC.ACCESS NFE URBAN2011 ELEC.HRS.3
## 0 0 144 0 0 5197
## ELEC.HRS.4
## 5197
# Column 3 (ELEC.ACCESS) is left out of the check, so rows whose only NA is
# in ELEC.ACCESS stay in df_g4.
df3 <- which(rowSums(is.na(allvariables.1box2[, c(1, 2, 4, 5, 6, 7)])) > 0)
#df3 # row nos that have NAs
length(df3) #no of rows to be deleted
## [1] 5197
# Guard the negative index: x[-integer(0), ] would return zero rows.
df_g4 <- if (length(df3) > 0) {
  allvariables.1box2[-df3, ] #removing NAs
} else {
  allvariables.1box2
}
str(df_g4)
## 'data.frame': 34757 obs. of 7 variables:
## $ STATEID : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ INCOME : num 176100 1039150 182340 90760 212600 ...
## $ ELEC.ACCESS: Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 2 2 2 ...
## $ NFE : Factor w/ 2 levels "No 0","Yes 1": 1 1 1 1 1 1 1 1 1 1 ...
## $ URBAN2011 : Factor w/ 2 levels "rural 0","urban 1": 1 1 1 1 1 1 1 1 1 1 ...
## $ ELEC.HRS.3 : Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
## $ ELEC.HRS.4 : Factor w/ 4 levels "0-6","6-12","12-18",..: 3 2 2 2 2 3 1 4 4 2 ...
# Re-count NAs per column of df_g4.
# vapply() avoids the data-frame-to-matrix coercion done by apply(, 2, ).
vapply(df_g4, function(x) sum(is.na(x)), integer(1))
## STATEID INCOME ELEC.ACCESS NFE URBAN2011 ELEC.HRS.3
## 0 0 2 0 0 0
## ELEC.HRS.4
## 0
# Jittered points with a transparent box plot on top: log(INCOME) against
# ELEC.HRS.3, ELEC.HRS.4 and STATEID.
g4 <- ggplot(
  data = df_g4,
  mapping = aes(x = ELEC.HRS.3, y = log(INCOME))
) +
  geom_jitter(color = "tomato", alpha = 0.3) +
  geom_boxplot(alpha = 0)
g5 <- ggplot(
  data = df_g4,
  mapping = aes(x = ELEC.HRS.4, y = log(INCOME))
) +
  geom_jitter(color = "tomato", alpha = 0.3) +
  geom_boxplot(alpha = 0)
# The state plot also gets small axis text rotated by 90 degrees.
g6 <- ggplot(
  data = df_g4,
  mapping = aes(x = STATEID, y = log(INCOME))
) +
  geom_jitter(color = "tomato", alpha = 0.3) +
  geom_boxplot(alpha = 0) +
  theme(axis.text = element_text(size = 6, angle = 90))
library(gridExtra)
# Lay out all six stored plots in two columns.
grid.arrange(g1, g2, g3, g4, g5, g6, ncol = 2)

#Scatterplots
# Keep only the columns needed for the scatterplots.
allvariables.1scat <- select(
  allvariables.1,
  c(
    STATEID, INCOME, ELEC.ACCESS, ELEC.HRS, NFE,
    ASSETS, URBAN2011, ELEC.HRS.3, ELEC.HRS.4
  )
)
# Preview the first rows of the subset.
head(allvariables.1scat)
## STATEID INCOME ELEC.ACCESS ELEC.HRS NFE ASSETS URBAN2011
## 1 Jammu & Kashmir 01 176100 Yes 1 12 No 0 20 rural 0
## 2 Jammu & Kashmir 01 1039150 Yes 1 8 No 0 24 rural 0
## 3 Jammu & Kashmir 01 182340 Yes 1 8 No 0 22 rural 0
## 4 Jammu & Kashmir 01 90760 Yes 1 8 No 0 16 rural 0
## 5 Jammu & Kashmir 01 212600 Yes 1 8 No 0 17 rural 0
## 6 Jammu & Kashmir 01 152100 Yes 1 14 No 0 12 rural 0
## ELEC.HRS.3 ELEC.HRS.4
## 1 1-16 12-18
## 2 1-16 6-12
## 3 1-16 6-12
## 4 1-16 6-12
## 5 1-16 6-12
## 6 1-16 12-18
# Count the missing values in each column of the scatterplot subset.
# vapply() iterates over columns without coercing the data frame to a matrix.
vapply(allvariables.1scat, function(x) sum(is.na(x)), integer(1))
## STATEID INCOME ELEC.ACCESS ELEC.HRS NFE ASSETS
## 0 0 144 5197 0 18
## URBAN2011 ELEC.HRS.3 ELEC.HRS.4
## 0 5197 5197
# Only columns 1-3 (STATEID, INCOME, ELEC.ACCESS) are checked here, so NAs
# in the other columns remain in df_g5.
df4 <- which(rowSums(is.na(allvariables.1scat[, c(1, 2, 3)])) > 0)
#df4 # row nos that have NAs
length(df4) #no of rows to be deleted
## [1] 144
# Guard the negative index: x[-integer(0), ] would return zero rows.
df_g5 <- if (length(df4) > 0) {
  allvariables.1scat[-df4, ] #removing NAs
} else {
  allvariables.1scat
}
str(df_g5)
## 'data.frame': 39810 obs. of 9 variables:
## $ STATEID : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ INCOME : num 176100 1039150 182340 90760 212600 ...
## $ ELEC.ACCESS: Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 2 2 2 ...
## $ ELEC.HRS : num 12 8 8 8 8 14 3 22 22 8 ...
## $ NFE : Factor w/ 2 levels "No 0","Yes 1": 1 1 1 1 1 1 1 1 1 1 ...
## $ ASSETS : num 20 24 22 16 17 12 13 7 11 10 ...
## $ URBAN2011 : Factor w/ 2 levels "rural 0","urban 1": 1 1 1 1 1 1 1 1 1 1 ...
## $ ELEC.HRS.3 : Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
## $ ELEC.HRS.4 : Factor w/ 4 levels "0-6","6-12","12-18",..: 3 2 2 2 2 3 1 4 4 2 ...
# Re-count NAs per column of df_g5.
# vapply() avoids the data-frame-to-matrix coercion done by apply(, 2, ).
vapply(df_g5, function(x) sum(is.na(x)), integer(1))
## STATEID INCOME ELEC.ACCESS ELEC.HRS NFE ASSETS
## 0 0 0 5055 0 12
## URBAN2011 ELEC.HRS.3 ELEC.HRS.4
## 0 5055 5055
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 22 70 264 467 906 1245 2125 783 2653 713 2123 152 2487 246 583
## 15 16 17 18 19 20 21 22 23 24
## 1272 1313 351 2535 330 3882 624 3273 2155 4181
# Three-way contingency table: NFE x ELEC.ACCESS, one slice per ELEC.HRS.3 level.
table(
  df_g5[["NFE"]],
  df_g5[["ELEC.ACCESS"]],
  df_g5[["ELEC.HRS.3"]]
)
## , , = No Access
##
##
## No 0 Yes 1
## No 0 0 20
## Yes 1 0 2
##
## , , = 1-16
##
##
## No 0 Yes 1
## No 0 0 12735
## Yes 1 0 3354
##
## , , = 17-24
##
##
## No 0 Yes 1
## No 0 0 14252
## Yes 1 0 4392
# INCOME against ELEC.HRS, one panel per ELEC.ACCESS level.
ggplot(data = df_g5, mapping = aes(x = ELEC.HRS, y = INCOME)) +
  geom_point(color = "aquamarine4") +
  facet_wrap(~ELEC.ACCESS, ncol = 2) +
  ggtitle("Elect Hours and Income grouped by ELEC.ACCESS") +
  scale_y_continuous(labels = comma)

# INCOME against NFE, one panel per ELEC.ACCESS level.
ggplot(data = df_g5, mapping = aes(x = NFE, y = INCOME)) +
  geom_point(color = "aquamarine4") +
  facet_wrap(~ELEC.ACCESS, ncol = 2) +
  ggtitle("NFE and Income grouped by ELEC.ACCESS") +
  scale_y_continuous(labels = comma)

## 'data.frame': 34442 obs. of 26 variables:
## $ STATEID : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ DISTID : num 2 2 2 2 2 2 2 2 2 2 ...
## $ VNID : num 1 1 1 1 1 1 1 1 1 1 ...
## $ ROOMS : num 12 10 3 4 10 5 5 2 7 2 ...
## $ INCOME : num 176100 1039150 182340 90760 212600 ...
## $ EDU.HH : Factor w/ 18 levels "none 0","1st class 1",..: 1 1 11 1 1 1 1 1 5 1 ...
## $ WATER : Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 1 2 2 ...
## $ RICE.P : num 17 20 15 20 20 12 25 25 12 25 ...
## $ ELEC.ACCESS : Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 2 2 2 ...
## $ ELEC.HRS : num 12 8 8 8 8 14 3 22 22 8 ...
## $ NFE : Factor w/ 2 levels "No 0","Yes 1": 1 1 1 1 1 1 1 1 1 1 ...
## $ ASSETS : num 20 24 22 16 17 12 13 7 11 10 ...
## $ NADULTM : num 2 5 1 1 3 4 2 1 1 1 ...
## $ NADULTF : num 2 3 2 1 3 3 2 1 1 1 ...
## $ NCHILDM : num 2 3 1 1 1 2 0 4 3 2 ...
## $ NCHILDF : num 2 3 1 1 1 2 0 1 0 0 ...
## $ NTEENM : num 1 1 1 0 2 0 0 0 2 0 ...
## $ NTEENF : num 0 1 1 1 0 0 4 1 1 0 ...
## $ URBAN2011 : Factor w/ 2 levels "rural 0","urban 1": 1 1 1 1 1 1 1 1 1 1 ...
## $ NADULT : num 4 8 3 2 6 7 4 2 2 2 ...
## $ NCHILD : num 4 6 2 2 2 4 0 5 3 2 ...
## $ NTEEN : num 1 2 2 1 2 0 4 1 3 0 ...
## $ ELEC.HRS.3 : Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
## $ ELEC.HRS.4 : Factor w/ 4 levels "0-6","6-12","12-18",..: 3 2 2 2 2 3 1 4 4 2 ...
## $ filter_. : Factor w/ 2 levels "Not Selected",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ ELEC.HRS.3.NEW: Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:5512] 42 148 189 198 221 234 270 272 274 278 ...
## .. ..- attr(*, "names")= chr [1:5512] "42" "148" "189" "198" ...
# INCOME against ELEC.HRS, one panel per ELEC.HRS.4 level.
ggplot(data = allvariables.1nonas, mapping = aes(x = ELEC.HRS, y = INCOME)) +
  geom_point(color = "aquamarine4") +
  facet_wrap(~ELEC.HRS.4, ncol = 2) +
  ggtitle("Elect Hours and Income grouped by ELEC.HRS.4") +
  scale_y_continuous(labels = comma)

# INCOME against ASSETS, one panel per ELEC.HRS.3 level.
ggplot(data = allvariables.1nonas, mapping = aes(x = ASSETS, y = INCOME)) +
  geom_point(color = "aquamarine4") +
  facet_wrap(~ELEC.HRS.3, ncol = 3) +
  ggtitle("Assets and Income grouped by ELEC.HRS.3") +
  scale_y_continuous(labels = comma)

# INCOME against ELEC.HRS, one panel per NFE level.
ggplot(data = allvariables.1nonas, mapping = aes(x = ELEC.HRS, y = INCOME)) +
  geom_point(color = "aquamarine4") +
  facet_wrap(~NFE, ncol = 2) +
  ggtitle("Elect Hours and Income grouped by NFE") +
  scale_y_continuous(labels = comma)

# INCOME against ASSETS, one panel per ELEC.HRS.4 level.
ggplot(data = allvariables.1nonas, mapping = aes(x = ASSETS, y = INCOME)) +
  geom_point(color = "aquamarine4") +
  facet_wrap(~ELEC.HRS.4, ncol = 4) +
  ggtitle("Assets and Income grouped by ELEC.HRS.4") +
  scale_y_continuous(labels = comma)

# INCOME against ASSETS for every URBAN2011 x ELEC.HRS.4 combination;
# axis labels are formatted without scientific notation.
ggplot(data = allvariables.1nonas, mapping = aes(x = ASSETS, y = INCOME)) +
  geom_point(color = "aquamarine4") +
  facet_wrap(~ URBAN2011 + ELEC.HRS.4, ncol = 4) +
  ggtitle("Assets and Income grouped by ELEC.HRS.4 & URBAN2011") +
  scale_y_continuous(labels = function(n) format(n, scientific = FALSE))

# INCOME by ELEC.ACCESS, coloured and faceted by state.
ggplot(data = df_g5, mapping = aes(x = ELEC.ACCESS, y = INCOME)) +
  geom_point(mapping = aes(colour = STATEID)) +
  facet_wrap(~STATEID) +
  scale_y_continuous(labels = comma)

# log(INCOME) against ASSETS per state, with a smoother in each panel.
ggplot(data = df_g5, mapping = aes(x = ASSETS, y = log(INCOME))) +
  geom_point(mapping = aes(colour = STATEID)) +
  facet_wrap(~STATEID) +
  stat_smooth()

# Stacked histogram of log(INCOME), filled by ELEC.HRS.4, on the grey theme.
ggplot(data = df_g5) +
  geom_histogram(mapping = aes(x = log(INCOME), fill = ELEC.HRS.4)) +
  theme_grey()

# Same histogram with the bars placed side by side.
ggplot(data = df_g5) +
  geom_histogram(
    mapping = aes(x = log(INCOME), fill = ELEC.HRS.4),
    position = "dodge"
  )

# Density of log(INCOME) per ELEC.HRS.4 level, drawn as coloured outlines.
ggplot(data = df_g4) +
  geom_density(mapping = aes(x = log(INCOME), colour = ELEC.HRS.4))

# Same densities drawn as filled areas.
ggplot(data = df_g4) +
  geom_density(mapping = aes(x = log(INCOME), fill = ELEC.HRS.4))

###################################
# Extract only the numeric columns from the data frame.
library(dplyr)
# vapply() guarantees a logical mask with one entry per column; sapply()
# picks its return type from the input and is not type-stable.
new_df <- allvariables.1[vapply(allvariables.1, is.numeric, logical(1))]
# Drop every row that has an NA in any of the numeric columns.
new_df <- na.omit(new_df)
dim(new_df)
## [1] 34719 16
## 'data.frame': 34719 obs. of 16 variables:
## $ DISTID : num 2 2 2 2 2 2 2 2 2 2 ...
## $ VNID : num 1 1 1 1 1 1 1 1 1 1 ...
## $ ROOMS : num 12 10 3 4 10 5 5 2 7 2 ...
## $ INCOME : num 176100 1039150 182340 90760 212600 ...
## $ RICE.P : num 17 20 15 20 20 12 25 25 12 25 ...
## $ ELEC.HRS: num 12 8 8 8 8 14 3 22 22 8 ...
## $ ASSETS : num 20 24 22 16 17 12 13 7 11 10 ...
## $ NADULTM : num 2 5 1 1 3 4 2 1 1 1 ...
## $ NADULTF : num 2 3 2 1 3 3 2 1 1 1 ...
## $ NCHILDM : num 2 3 1 1 1 2 0 4 3 2 ...
## $ NCHILDF : num 2 3 1 1 1 2 0 1 0 0 ...
## $ NTEENM : num 1 1 1 0 2 0 0 0 2 0 ...
## $ NTEENF : num 0 1 1 1 0 0 4 1 1 0 ...
## $ NADULT : num 4 8 3 2 6 7 4 2 2 2 ...
## $ NCHILD : num 4 6 2 2 2 4 0 5 3 2 ...
## $ NTEEN : num 1 2 2 1 2 0 4 1 3 0 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:5235] 309 500 799 1187 1352 1572 1622 1955 1967 2161 ...
## .. ..- attr(*, "names")= chr [1:5235] "309" "500" "799" "1187" ...
# Interactive data table (DT), followed by a static table (knitr).
library(DT)
library(knitr)
# Interactive table of the numeric data, five rows per page.
datatable(data = new_df, options = list(pageLength = 5))
# Static table of the first rows.
kable(head(new_df))
| 2 |
1 |
12 |
176100 |
17 |
12 |
20 |
2 |
2 |
2 |
2 |
1 |
0 |
4 |
4 |
1 |
| 2 |
1 |
10 |
1039150 |
20 |
8 |
24 |
5 |
3 |
3 |
3 |
1 |
1 |
8 |
6 |
2 |
| 2 |
1 |
3 |
182340 |
15 |
8 |
22 |
1 |
2 |
1 |
1 |
1 |
1 |
3 |
2 |
2 |
| 2 |
1 |
4 |
90760 |
20 |
8 |
16 |
1 |
1 |
1 |
1 |
0 |
1 |
2 |
2 |
1 |
| 2 |
1 |
10 |
212600 |
20 |
8 |
17 |
3 |
3 |
1 |
1 |
2 |
0 |
6 |
2 |
2 |
| 2 |
1 |
5 |
152100 |
12 |
14 |
12 |
4 |
3 |
2 |
2 |
0 |
0 |
7 |
4 |
0 |
# Positions of the columns in new_df that still contain an NA.
colnos <- which(colSums(is.na(new_df)) > 0)
colnos
## named integer(0)
## DISTID VNID ROOMS INCOME RICE.P ELEC.HRS ASSETS
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## NADULTM NADULTF NCHILDM NCHILDF NTEENM NTEENF NADULT
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## NCHILD NTEEN
## "numeric" "numeric"
## DISTID VNID ROOMS INCOME RICE.P
## 1.459187e+01 5.799677e+00 2.792678e+00 1.361545e+05 2.260492e+01
## ELEC.HRS ASSETS NADULTM NADULTF NCHILDM
## 1.532054e+01 1.668634e+01 1.463925e+00 1.528903e+00 6.913794e-01
## NCHILDF NTEENM NTEENF NADULT NCHILD
## 6.300873e-01 2.871338e-01 2.926928e-01 2.992828e+00 1.321467e+00
## NTEEN
## 5.798266e-01
#Dot plots
# Row numbers with an NA in any of the nine scatterplot columns.
df6 <- which(rowSums(is.na(allvariables.1scat[, c(1:9)])) > 0)
#df6 # row nos that have NAs
length(df6) #no of rows to be deleted
## [1] 5211
# Guard the negative index: x[-integer(0), ] would return zero rows.
df_g7 <- if (length(df6) > 0) {
  allvariables.1scat[-df6, ] #removing NAs
} else {
  allvariables.1scat
}
str(df_g7)
## 'data.frame': 34743 obs. of 9 variables:
## $ STATEID : Factor w/ 35 levels "Jammu & Kashmir 01",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ INCOME : num 176100 1039150 182340 90760 212600 ...
## $ ELEC.ACCESS: Factor w/ 2 levels "No 0","Yes 1": 2 2 2 2 2 2 2 2 2 2 ...
## $ ELEC.HRS : num 12 8 8 8 8 14 3 22 22 8 ...
## $ NFE : Factor w/ 2 levels "No 0","Yes 1": 1 1 1 1 1 1 1 1 1 1 ...
## $ ASSETS : num 20 24 22 16 17 12 13 7 11 10 ...
## $ URBAN2011 : Factor w/ 2 levels "rural 0","urban 1": 1 1 1 1 1 1 1 1 1 1 ...
## $ ELEC.HRS.3 : Factor w/ 3 levels "No Access","1-16",..: 2 2 2 2 2 2 2 3 3 2 ...
## $ ELEC.HRS.4 : Factor w/ 4 levels "0-6","6-12","12-18",..: 3 2 2 2 2 3 1 4 4 2 ...
# Re-count NAs per column of df_g7 (all counts should be 0).
# vapply() avoids the data-frame-to-matrix coercion done by apply(, 2, ).
vapply(df_g7, function(x) sum(is.na(x)), integer(1))
## STATEID INCOME ELEC.ACCESS ELEC.HRS NFE ASSETS
## 0 0 0 0 0 0
## URBAN2011 ELEC.HRS.3 ELEC.HRS.4
## 0 0 0
# Dot plot of ELEC.HRS, one panel per ELEC.HRS.3 level.
ggplot(df_g7, aes(x = ELEC.HRS)) +
  geom_dotplot(dotsize = 0.4, col = "darkred") +
  facet_grid(~ELEC.HRS.3) +
  ggtitle("Dot plot of ELEC.HRS grouped by ELEC.HRS.3 ")

# Dot plot of ELEC.HRS, one panel per ELEC.HRS.4 level.
# The title now names ELEC.HRS.4, the variable this plot is faceted by.
ggplot(df_g7, aes(x = ELEC.HRS)) +
  geom_dotplot(dotsize = 0.4, col = "darkred") +
  facet_grid(~ELEC.HRS.4) +
  ggtitle("Dot plot of ELEC.HRS grouped by ELEC.HRS.4 ")

# Distribution of INCOME.
# NOTE(review): the original comment mentions a histogram "after some
# cleaning (remove very high values)", but no filtering step is visible
# here - confirm whether df_g5 was meant to be trimmed first.
ggplot(data = df_g5, mapping = aes(x = INCOME)) +
  geom_histogram(bins = 40, color = "white") +
  theme(axis.text.x = element_text(size = 8, angle = 90, hjust = 1)) +
  scale_x_continuous(
    breaks = seq(from = -100000, to = 15000000, by = 1000000),
    labels = comma
  )

# Same variable with default binning, on the grey theme.
ggplot(data = df_g5) +
  geom_histogram(mapping = aes(x = INCOME), fill = "darkgreen") +
  theme_grey()

# How the distributions change across the levels of each factor.
library(mosaic)
histogram(
  ~ INCOME | ELEC.ACCESS,
  data = df_g5,
  layout = c(1, 2),
  main = "Income by ELEC.ACCESS",
  col = "darkgreen"
)

histogram(
  ~ ASSETS | ELEC.ACCESS,
  data = df_g5,
  layout = c(1, 2),
  main = "Assets by ELEC.ACCESS",
  col = "darkgreen"
)

histogram(
  ~ ASSETS | URBAN2011,
  data = df_g5,
  layout = c(1, 2),
  main = "Assets by URBAN2011",
  col = "darkgreen"
)

histogram(
  ~ ASSETS | ELEC.HRS.3,
  data = df_g5,
  main = "Assets by ELEC.HRS.3",
  col = "darkgreen"
)

histogram(
  ~ ASSETS | ELEC.HRS.4,
  data = df_g5,
  main = "Assets by ELEC.HRS.4",
  col = "darkgreen"
)

histogram(
  ~ ASSETS | NFE,
  data = df_g5,
  main = "Assets by NFE",
  col = "darkgreen"
)

histogram(
  ~ ASSETS | STATEID,
  data = df_g5,
  main = "Assets by STATEID",
  col = "darkgreen"
)

histogram(
  ~ INCOME | STATEID,
  data = df_g5,
  main = "INCOME by STATEID",
  col = "darkgreen"
)

# The same ASSETS histograms using ggplot2.
# guides(fill = "none") hides the fill legend; the logical form
# guides(fill = FALSE) is deprecated in current ggplot2.
# Faceted by ELEC.HRS.4
ggplot(df_g5, aes(ASSETS)) +
  geom_histogram(color = "white", aes(fill = ELEC.HRS.4), bins = 30) +
  theme(axis.text.x = element_text(angle = 90)) +
  facet_grid(~ELEC.HRS.4) +
  guides(fill = "none")

# Faceted by ELEC.HRS.4 and URBAN2011
ggplot(df_g5, aes(ASSETS)) +
  geom_histogram(color = "white", aes(fill = ELEC.HRS.4), bins = 30) +
  theme(axis.text.x = element_text(angle = 90)) +
  facet_grid(URBAN2011 ~ ELEC.HRS.4, scales = "free_y") +
  guides(fill = "none")

# Same facets with explicit x breaks every 4 units and labels rotated by
# 270 degrees instead of 90.
ggplot(df_g5, aes(ASSETS)) +
  geom_histogram(color = "white", aes(fill = ELEC.HRS.4), bins = 30) +
  theme(axis.text.x = element_text(angle = 270)) +
  facet_grid(URBAN2011 ~ ELEC.HRS.4, scales = "free_y") +
  scale_x_continuous(breaks = seq(0, 40, 4)) +
  guides(fill = "none")

# using arg angle = 270 instead of angle = 90
#############################
library(Hmisc)
# Normal Q-Q plot of INCOME with a small histogram inset in the top-left.
tmp <- new_df$INCOME
qqnorm(tmp)
qqline(tmp)
# subplot() draws the histogram inside the current plot region; its return
# value is passed to par() below so the reference line can be drawn in the
# inset.
tmp2 <- subplot(
  hist(tmp, xlab = '', ylab = '', main = ''),
  cnvrt.coords(0.1, 0.9, 'plt')$usr, vadj = 1, hadj = 0
)
# Save the current graphics parameters, switch to the inset to add the
# vertical line at 0, then restore them.
op <- par(no.readonly = TRUE)
par(tmp2)
abline(v = 0, col = 'green')
# Without this restore, the inset's parameters stay active for later plots.
par(op)

# Various ways to summarise new_df; first, base R's per-column summary().
summary(new_df)
## DISTID VNID ROOMS INCOME
## Min. : 1.00 Min. : 1.0 Min. : 1.000 Min. :-1037040
## 1st Qu.: 6.00 1st Qu.: 2.0 1st Qu.: 2.000 1st Qu.: 43500
## Median :12.00 Median : 4.0 Median : 2.000 Median : 80590
## Mean :14.59 Mean : 5.8 Mean : 2.793 Mean : 136154
## 3rd Qu.:20.00 3rd Qu.: 7.0 3rd Qu.: 4.000 3rd Qu.: 154222
## Max. :68.00 Max. :39.0 Max. :50.000 Max. :11360000
## RICE.P ELEC.HRS ASSETS NADULTM
## Min. : 0.0 Min. : 0.00 Min. : 1.00 Min. :0.000
## 1st Qu.: 18.0 1st Qu.: 9.00 1st Qu.:12.00 1st Qu.:1.000
## Median : 22.0 Median :16.00 Median :17.00 Median :1.000
## Mean : 22.6 Mean :15.32 Mean :16.69 Mean :1.464
## 3rd Qu.: 27.0 3rd Qu.:22.00 3rd Qu.:21.00 3rd Qu.:2.000
## Max. :120.0 Max. :24.00 Max. :33.00 Max. :9.000
## NADULTF NCHILDM NCHILDF NTEENM
## Min. :0.000 Min. :0.0000 Min. : 0.0000 Min. :0.0000
## 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.: 0.0000 1st Qu.:0.0000
## Median :1.000 Median :0.0000 Median : 0.0000 Median :0.0000
## Mean :1.529 Mean :0.6914 Mean : 0.6301 Mean :0.2871
## 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.: 1.0000 3rd Qu.:0.0000
## Max. :9.000 Max. :8.0000 Max. :10.0000 Max. :5.0000
## NTEENF NADULT NCHILD NTEEN
## Min. :0.0000 Min. : 0.000 Min. : 0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.: 2.000 1st Qu.: 0.000 1st Qu.:0.0000
## Median :0.0000 Median : 3.000 Median : 1.000 Median :0.0000
## Mean :0.2927 Mean : 2.993 Mean : 1.321 Mean :0.5798
## 3rd Qu.:0.0000 3rd Qu.: 4.000 3rd Qu.: 2.000 3rd Qu.:1.0000
## Max. :5.0000 Max. :18.000 Max. :18.000 Max. :7.0000
library(Hmisc)
# Per-variable description from Hmisc; the call is namespace-qualified
# because other attached packages also export a describe().
Hmisc::describe(new_df)
## new_df
##
## 16 Variables 34719 Observations
## ---------------------------------------------------------------------------
## DISTID
## n missing unique Info Mean .05 .10 .25 .50
## 34719 0 60 1 14.59 2 3 6 12
## .75 .90 .95
## 20 29 34
##
## lowest : 1 2 3 4 5, highest: 63 65 66 67 68
## ---------------------------------------------------------------------------
## VNID
## n missing unique Info Mean .05 .10 .25 .50
## 34719 0 39 0.99 5.8 1 1 2 4
## .75 .90 .95
## 7 11 15
##
## lowest : 1 2 3 4 5, highest: 35 36 37 38 39
## ---------------------------------------------------------------------------
## ROOMS
## n missing unique Info Mean .05 .10 .25 .50
## 34719 0 25 0.94 2.793 1 1 2 2
## .75 .90 .95
## 4 5 6
##
## lowest : 1 2 3 4 5, highest: 23 24 25 26 50
## ---------------------------------------------------------------------------
## INCOME
## n missing unique Info Mean .05 .10 .25 .50
## 34719 0 13698 1 136154 12000 22000 43500 80590
## .75 .90 .95
## 154223 289606 423000
##
## lowest : -1037040 -867025 -245000 -214475 -208138
## highest: 8096550 8322000 9563500 11169820 11360000
## ---------------------------------------------------------------------------
## RICE.P
## n missing unique Info Mean .05 .10 .25 .50
## 34719 0 96 0.99 22.6 10 15 18 22
## .75 .90 .95
## 27 32 35
##
## lowest : 0.0 0.5 1.0 2.0 2.5
## highest: 80.0 90.0 95.0 100.0 120.0
## ---------------------------------------------------------------------------
## ELEC.HRS
## n missing unique Info Mean .05 .10 .25 .50
## 34719 0 25 0.99 15.32 5 6 9 16
## .75 .90 .95
## 22 24 24
##
## lowest : 0 1 2 3 4, highest: 20 21 22 23 24
## ---------------------------------------------------------------------------
## ASSETS
## n missing unique Info Mean .05 .10 .25 .50
## 34719 0 33 1 16.69 7 8 12 17
## .75 .90 .95
## 21 24 26
##
## lowest : 1 2 3 4 5, highest: 29 30 31 32 33
## ---------------------------------------------------------------------------
## NADULTM
## n missing unique Info Mean .05 .10 .25 .50
## 34719 0 10 0.81 1.464 0 1 1 1
## .75 .90 .95
## 2 3 3
##
## 0 1 2 3 4 5 6 7 8 9
## Frequency 2411 19192 8961 3216 716 160 49 8 4 2
## % 7 55 26 9 2 0 0 0 0 0
## ---------------------------------------------------------------------------
## NADULTF
## n missing unique Info Mean .05 .10 .25 .50
## 34719 0 10 0.79 1.529 1 1 1 1
## .75 .90 .95
## 2 3 3
##
## 0 1 2 3 4 5 6 7 8 9
## Frequency 687 19611 10772 2853 651 117 20 6 1 1
## % 2 56 31 8 2 0 0 0 0 0
## ---------------------------------------------------------------------------
## NCHILDM
## n missing unique Info Mean
## 34719 0 9 0.83 0.6914
##
## 0 1 2 3 4 5 6 7 8
## Frequency 18125 10809 4549 943 217 60 13 2 1
## % 52 31 13 3 1 0 0 0 0
## ---------------------------------------------------------------------------
## NCHILDF
## n missing unique Info Mean .05 .10 .25 .50
## 34719 0 11 0.78 0.6301 0 0 0 0
## .75 .90 .95
## 1 2 2
##
## 0 1 2 3 4 5 6 7 8 9 10
## Frequency 20068 9543 3606 1069 305 90 28 7 1 1 1
## % 58 27 10 3 1 0 0 0 0 0 0
## ---------------------------------------------------------------------------
## NTEENM
## n missing unique Info Mean
## 34719 0 6 0.55 0.2871
##
## 0 1 2 3 4 5
## Frequency 26402 6801 1390 117 8 1
## % 76 20 4 0 0 0
## ---------------------------------------------------------------------------
## NTEENF
## n missing unique Info Mean
## 34719 0 6 0.56 0.2927
##
## 0 1 2 3 4 5
## Frequency 26340 6803 1394 159 21 2
## % 76 20 4 0 0 0
## ---------------------------------------------------------------------------
## NADULT
## n missing unique Info Mean .05 .10 .25 .50
## 34719 0 17 0.91 2.993 1 2 2 3
## .75 .90 .95
## 4 5 6
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 16
## Frequency 32 2157 14304 8014 5621 2442 1321 452 217 83 40 18 8 6 2 1
## % 0 6 41 23 16 7 4 1 1 0 0 0 0 0 0 0
## 18
## Frequency 1
## % 0
## ---------------------------------------------------------------------------
## NCHILD
## n missing unique Info Mean .05 .10 .25 .50
## 34719 0 14 0.92 1.321 0 0 0 1
## .75 .90 .95
## 2 3 4
##
## 0 1 2 3 4 5 6 7 8 9 10 11 15 18
## Frequency 13008 7641 7948 3679 1529 565 201 83 34 18 10 1 1 1
## % 37 22 23 11 4 2 1 0 0 0 0 0 0 0
## ---------------------------------------------------------------------------
## NTEEN
## n missing unique Info Mean
## 34719 0 8 0.76 0.5798
##
## 0 1 2 3 4 5 6 7
## Frequency 21195 8286 4070 998 145 21 2 2
## % 61 24 12 3 0 0 0 0
## ---------------------------------------------------------------------------
# scipen is a numeric penalty (positive values favour fixed over scientific
# notation when printing). `T` only worked by being coerced to 1 and is a
# reassignable variable, so the value is written out explicitly.
options(scipen = 1)
library(fBasics)
# Table of fBasics summary statistics for every numeric column.
kable(basicStats(new_df), digits = 2)
| nobs |
34719.00 |
34719.00 |
34719.00 |
34719.00 |
34719.00 |
34719.00 |
34719.00 |
34719.00 |
34719.00 |
34719.00 |
34719.00 |
34719.00 |
34719.00 |
34719.00 |
34719.00 |
34719.00 |
| NAs |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
| Minimum |
1.00 |
1.00 |
1.00 |
-1037040.00 |
0.00 |
0.00 |
1.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
| Maximum |
68.00 |
39.00 |
50.00 |
11360000.00 |
120.00 |
24.00 |
33.00 |
9.00 |
9.00 |
8.00 |
10.00 |
5.00 |
5.00 |
18.00 |
18.00 |
7.00 |
| 1. Quartile |
6.00 |
2.00 |
2.00 |
43500.00 |
18.00 |
9.00 |
12.00 |
1.00 |
1.00 |
0.00 |
0.00 |
0.00 |
0.00 |
2.00 |
0.00 |
0.00 |
| 3. Quartile |
20.00 |
7.00 |
4.00 |
154222.50 |
27.00 |
22.00 |
21.00 |
2.00 |
2.00 |
1.00 |
1.00 |
0.00 |
0.00 |
4.00 |
2.00 |
1.00 |
| Mean |
14.59 |
5.80 |
2.79 |
136154.48 |
22.60 |
15.32 |
16.69 |
1.46 |
1.53 |
0.69 |
0.63 |
0.29 |
0.29 |
2.99 |
1.32 |
0.58 |
| Median |
12.00 |
4.00 |
2.00 |
80590.00 |
22.00 |
16.00 |
17.00 |
1.00 |
1.00 |
0.00 |
0.00 |
0.00 |
0.00 |
3.00 |
1.00 |
0.00 |
| Sum |
506615.00 |
201359.00 |
96959.00 |
4727147456.95 |
784820.30 |
531914.00 |
579333.00 |
50826.00 |
53082.00 |
24004.00 |
21876.00 |
9969.00 |
10162.00 |
103908.00 |
45880.00 |
20131.00 |
| SE Mean |
0.06 |
0.03 |
0.01 |
1227.71 |
0.05 |
0.04 |
0.03 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.01 |
0.01 |
0.00 |
| LCL Mean |
14.47 |
5.74 |
2.77 |
133748.14 |
22.51 |
15.25 |
16.62 |
1.45 |
1.52 |
0.68 |
0.62 |
0.28 |
0.29 |
2.98 |
1.31 |
0.57 |
| UCL Mean |
14.71 |
5.86 |
2.81 |
138560.83 |
22.70 |
15.39 |
16.75 |
1.47 |
1.54 |
0.70 |
0.64 |
0.29 |
0.30 |
3.01 |
1.34 |
0.59 |
| Variance |
133.65 |
34.13 |
2.96 |
52330646462.37 |
80.08 |
46.21 |
36.27 |
0.80 |
0.63 |
0.76 |
0.82 |
0.31 |
0.32 |
2.05 |
1.93 |
0.72 |
| Stdev |
11.56 |
5.84 |
1.72 |
228758.93 |
8.95 |
6.80 |
6.02 |
0.89 |
0.79 |
0.87 |
0.91 |
0.56 |
0.57 |
1.43 |
1.39 |
0.85 |
| Skewness |
1.74 |
2.96 |
2.63 |
15.86 |
0.70 |
-0.28 |
-0.07 |
1.23 |
1.34 |
1.37 |
1.78 |
1.96 |
2.06 |
1.44 |
1.24 |
1.46 |
| Kurtosis |
4.42 |
11.09 |
25.80 |
549.95 |
5.82 |
-1.34 |
-0.69 |
2.80 |
2.79 |
2.31 |
4.35 |
3.74 |
4.51 |
3.56 |
2.73 |
1.84 |
library(psych)
# Table of psych's descriptive statistics; the call is namespace-qualified
# because Hmisc, attached earlier, also exports a describe().
kable(psych::describe(new_df), digits = 2)
| DISTID |
1 |
34719 |
14.59 |
11.56 |
12 |
13.06 |
10.38 |
1 |
68 |
67 |
1.74 |
4.42 |
0.06 |
| VNID |
2 |
34719 |
5.80 |
5.84 |
4 |
4.73 |
2.97 |
1 |
39 |
38 |
2.96 |
11.09 |
0.03 |
| ROOMS |
3 |
34719 |
2.79 |
1.72 |
2 |
2.56 |
1.48 |
1 |
50 |
49 |
2.63 |
25.80 |
0.01 |
| INCOME |
4 |
34719 |
136154.48 |
228758.93 |
80590 |
99396.43 |
68125.47 |
-1037040 |
11360000 |
12397040 |
15.86 |
549.95 |
1227.71 |
| RICE.P |
5 |
34719 |
22.60 |
8.95 |
22 |
22.48 |
5.93 |
0 |
120 |
120 |
0.70 |
5.82 |
0.05 |
| ELEC.HRS |
6 |
34719 |
15.32 |
6.80 |
16 |
15.62 |
8.90 |
0 |
24 |
24 |
-0.28 |
-1.34 |
0.04 |
| ASSETS |
7 |
34719 |
16.69 |
6.02 |
17 |
16.75 |
7.41 |
1 |
33 |
32 |
-0.07 |
-0.69 |
0.03 |
| NADULTM |
8 |
34719 |
1.46 |
0.89 |
1 |
1.37 |
0.00 |
0 |
9 |
9 |
1.23 |
2.80 |
0.00 |
| NADULTF |
9 |
34719 |
1.53 |
0.79 |
1 |
1.40 |
0.00 |
0 |
9 |
9 |
1.34 |
2.79 |
0.00 |
| NCHILDM |
10 |
34719 |
0.69 |
0.87 |
0 |
0.56 |
0.00 |
0 |
8 |
8 |
1.37 |
2.31 |
0.00 |
| NCHILDF |
11 |
34719 |
0.63 |
0.91 |
0 |
0.46 |
0.00 |
0 |
10 |
10 |
1.78 |
4.35 |
0.00 |
| NTEENM |
12 |
34719 |
0.29 |
0.56 |
0 |
0.17 |
0.00 |
0 |
5 |
5 |
1.96 |
3.74 |
0.00 |
| NTEENF |
13 |
34719 |
0.29 |
0.57 |
0 |
0.18 |
0.00 |
0 |
5 |
5 |
2.06 |
4.51 |
0.00 |
| NADULT |
14 |
34719 |
2.99 |
1.43 |
3 |
2.81 |
1.48 |
0 |
18 |
18 |
1.44 |
3.56 |
0.01 |
| NCHILD |
15 |
34719 |
1.32 |
1.39 |
1 |
1.13 |
1.48 |
0 |
18 |
18 |
1.24 |
2.73 |
0.01 |
| NTEEN |
16 |
34719 |
0.58 |
0.85 |
0 |
0.43 |
0.00 |
0 |
7 |
7 |
1.46 |
1.84 |
0.00 |
# Scatterplot matrices of columns 3 to 7 of new_df.
library(car)
# car version: histograms on the diagonal, no smoother.
scatterplotMatrix(
  new_df[, 3:7],
  diagonal = "histogram",
  smooth = FALSE
)

library(gpairs)
# gpairs version of the same five columns.
gpairs(
  new_df[, 3:7],
  upper.pars = list(scatter = "stats"),
  stat.pars = list(verbose = FALSE)
)

# Density plots of columns 3 to 7 of new_df on a 3 x 3 grid.
# par() returns the previous values of the parameters it changes; they are
# kept so that both mfrow and mar can be restored afterwards (the earlier
# code reset only mfrow and left the reduced margins in place).
op <- par(mfrow = c(3, 3), mar = c(2.5, 2, 1.5, 1.5))
for (i in 3:7) {
  d <- density(new_df[, i])
  # Set up empty axes titled with the column name, then draw the density
  # as a filled polygon.
  plot(d, type = "n", main = names(new_df)[i])
  polygon(d, col = "red", border = "gray")
}
par(op)

My files for reference:
**file:///J:/rstudio%20files/BoulderBCycle.html**