## Warning: package 'tidyverse' was built under R version 3.4.4
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.0.0 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.6
## v tidyr 0.8.1 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.3.0
## Warning: package 'ggplot2' was built under R version 3.4.4
## Warning: package 'tibble' was built under R version 3.4.4
## Warning: package 'tidyr' was built under R version 3.4.4
## Warning: package 'readr' was built under R version 3.4.2
## Warning: package 'purrr' was built under R version 3.4.4
## Warning: package 'dplyr' was built under R version 3.4.4
## Warning: package 'stringr' was built under R version 3.4.4
## Warning: package 'forcats' was built under R version 3.4.4
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## # A tibble: 16,290 x 21
## hhcode Region Province hhae AEDCC Calperday Cal_req Expenditure pcme
## <dbl> <fct> <fct> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
## 1 1.00e10 Urban Punjab 7.68 2141. 2141. 17700 33062. 3674.
## 2 1.00e10 Urban Punjab 5.64 2359. 2359. 13150 23449 3350.
## 3 1.00e10 Urban Punjab 5.34 1727. 1727. 12750 20479. 3413.
## 4 1.00e10 Urban Punjab 6.94 1525. 1525. 16350 18999. 2375.
## 5 1.00e10 Urban Punjab 5 2063. 2063. 12000 15032. 2505.
## 6 1.00e10 Urban Punjab 4.52 1691. 1691. 10800 12372. 2474.
## 7 1.00e10 Urban Punjab 6.10 2044. 2044. 13750 24363. 4060.
## 8 1.00e10 Urban Punjab 2.30 1803. 1803. 5350 8336. 4168.
## 9 1.00e10 Urban Punjab 4.06 2337. 2337. 9150 12245. 3061.
## 10 1.00e10 Urban Punjab 8.97 1571. 1571. 21520 25501. 2550.
## # ... with 16,280 more rows, and 12 more variables: pmfe <dbl>,
## # lnPCME <dbl>, lnPMFE <dbl>, pmcc <dbl>, lnPMCC <dbl>, DailyExp <dbl>,
## # DailyPro_Con <dbl>, MPCons <dbl>, DailyFat_Con <dbl>, MFCons <dbl>,
## # MCarCons <dbl>, DailyFE <dbl>
We shall explore income (Expenditure as a proxy for income) data using some descriptive statistics, graphs and applying some transformations. Income data are usually highly skewed and we shall plot histogram. We shall apply log transformation and will again look at histogram. To understand and getting insight of data it is important that number of bins (classes) should niether be too large nor too small. We shall show how variation in number of bins reveals different patterns. Histogram given in the following section, we can find out that data are highly skewed. Summarising such data usually should be through 5 number summary statistics i-e min, max, Q1, Q2, Q3 and box-plot is a nice way to do that. We shall make box-plot in the next section.
summary(cal_inc$Expenditure) ## Before we plot data we take a look at summary statistics
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1715 11657 16290 19846 23197 304284
## Summary Province wise
cal_inc %>%select(Expenditure,Province) %>% group_by(Province) %>% summarise(n(),mean(Expenditure),median(Expenditure),sd(Expenditure))
## Warning: package 'bindrcpp' was built under R version 3.4.4
## # A tibble: 4 x 5
## Province `n()` `mean(Expenditure)` `median(Expendit~ `sd(Expenditure~
## <fct> <int> <dbl> <dbl> <dbl>
## 1 Balochistan 2334 19006. 16676. 9655.
## 2 KP 2946 20189. 17106. 13485.
## 3 Punjab 6918 20314. 15993. 16391.
## 4 Sind 4092 19286. 15852. 14547.
## Plot histogram
ggplot(data=cal_inc,aes(x=Expenditure))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
A skewed histogram given above also indicate the need for transformation of data. So we shall plot histogram by using log of income data and we shall also use box-plot for getting an idea about outliers. Histogram and Boxplot are also very used for comparing data among regions or provinces and we shall carry out this exercise as this will provide very useful insight that how income varies within a province by having income distribution across different provinces.
ggplot(data=cal_inc,aes(x=Expenditure))+geom_histogram()+scale_x_log10()## histogram using log of total income
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Change bin_size to further explore the behavior of this data
ggplot(data=cal_inc,aes(x=log(Expenditure)))+geom_histogram(binwidth = 0.05)
## Kernel Density
ggplot(data=cal_inc,aes(x=log(Expenditure)))+geom_density()
## Fill it with Red
ggplot(data=cal_inc,aes(x=log(Expenditure)))+geom_density(fill='red')
##Fill it Province
ggplot(cal_inc,aes(x=log(Expenditure),fill=Province))+geom_density(alpha=0.4)
## Boxplot
ggplot(data=cal_inc,aes(x=Province,y=log(Expenditure)))+geom_boxplot()
ggplot(data=cal_inc,aes(x=Province,y=log(Expenditure),fill=Province))+
geom_boxplot()
## Bivariate Analysis Now we shall explore the relationship between calories consumption per day and income. First of all we shall have a scatter plot and will see what kind of this relationship it is? Is it linear , nonlinear at national/provincial/regional level.
ggplot(data=cal_inc,aes(x=Expenditure,y=Calperday))+geom_point()
## Urban and Rural
ggplot(data=cal_inc,aes(x=log(Expenditure),y=Calperday,color=Region))+geom_point()
##Province wise
ggplot(data=cal_inc,aes(x=log(Expenditure),y=Calperday,color=Province))+geom_point()
##
Balochistan<-cal_inc %>% filter(Province=='Balochistan')
## Balochistan
ggplot(data=Balochistan,aes(x=log(Expenditure),y=Calperday,color=Region))+geom_point()
## Smoothed Regression
ggplot(data=Balochistan,aes(x=log(Expenditure),y=Calperday))+geom_point()+
geom_smooth(method="lm", formula=y~x)
##Regionwise
ggplot(data=Balochistan,aes(x=log(Expenditure),y=Calperday,color=Region))+geom_point()+
geom_smooth(method="lm", formula=y~x)
ggplot(data=Balochistan,aes(x=log(Expenditure),y=Calperday,color=Region,shape=Region))+geom_point()+
geom_smooth(method="lm", formula=y~x)
## Sind
Sind<-cal_inc %>% filter(Province=='Sind') ## Using the filter we have selected data only for Sind Province
ggplot(data=Sind,aes(x=log(Expenditure),y=Calperday,color=Region))+geom_point()+
geom_smooth(method="lm", formula=y~x)+
theme(legend.position="none") # This removes the legend
## Another way of doing the same thing for each province is using facet
ggplot(data=cal_inc,aes(x=log(Expenditure),y=Calperday))+
facet_grid(Province~.)+geom_point()
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.