Basic Exploratory Analysis and Base Plotting: Rudimentary Usage of R

This is the first assignment of the Certificate in Computer Applications in Economic Analysis (CAEA) offered by the Gokhale Institute of Politics & Economics (GIPE), Pune.

The R module of the above CAEA course is being delivered by Dr. Savita Kulkarni from the Symbiosis School of Economics (SSE), Pune.

I have used R Markdown in RStudio for this assignment. R Markdown lets me knit the code chunks together with their output and plots, and I can also create HTML, PDF or Word documents by knitting the .Rmd file created here.

I have applied various functions and packages to explore the data. This is simply to reapply what I have understood and learnt since I started the beautiful journey of exploring R. I have done more than the assignment requires. Many things may seem unnecessary for completing the assignment, but ultimately, if one tortures the data, more insights can be gathered about the data at hand, which can be useful in later stages of further analysis and modeling.

1. Getting Data into R

# Import the data set; the CSV file is assumed to be saved in the current
# working directory.
# stringsAsFactors = TRUE is spelled out because its default became FALSE in
# R 4.0.0; the factor-based output shown for `Quarter` depends on it.
rbigdp <- read.csv("rbigdp.csv", header = TRUE, stringsAsFactors = TRUE)

2. Exploratory analysis of the dataset using various functions and packages

# Inspect the structure of the data set: variable names, types and first values
utils::str(object = rbigdp)
## 'data.frame':    54 obs. of  10 variables:
##  $ Quarter      : Factor w/ 4 levels "Q1","Q2","Q3",..: 1 2 3 4 1 2 3 4 1 2 ...
##  $ Agriculture  : num  992 740 1370 1096 982 ...
##  $ Mining       : num  83.9 81.6 90.9 100.6 91.9 ...
##  $ Manufacturing: num  601 600 617 661 596 ...
##  $ Electricity  : num  89.4 88.8 92.8 95 95.4 ...
##  $ Construction : num  191 194 198 219 210 ...
##  $ Trade        : num  727 705 821 839 782 ...
##  $ Finance      : num  433 437 443 464 482 ...
##  $ Community    : num  444 438 457 674 447 ...
##  $ Gross        : num  3562 3284 4089 4149 3687 ...
# Other ways to get information about the data set
# Names of the variables (columns) of the object
colnames(rbigdp)
##  [1] "Quarter"       "Agriculture"   "Mining"        "Manufacturing"
##  [5] "Electricity"   "Construction"  "Trade"         "Finance"      
##  [9] "Community"     "Gross"
# Dimensions of the object: number of rows, then number of columns
dim(rbigdp)
## [1] 54 10
# Preview the first six rows of the data set (the last six are shown with
# tail() further down)
head(rbigdp, n = 6L)
##   Quarter Agriculture Mining Manufacturing Electricity Construction  Trade
## 1      Q1      992.41  83.91        601.40       89.38       191.11 727.47
## 2      Q2      739.74  81.61        600.01       88.75       193.66 705.34
## 3      Q3     1369.57  90.90        616.90       92.81       198.27 821.00
## 4      Q4     1095.84 100.57        661.45       94.99       218.54 839.46
## 5      Q1      981.92  91.89        595.78       95.41       209.89 782.25
## 6      Q2      745.26  90.12        601.15       97.47       208.59 759.33
##   Finance Community   Gross
## 1  432.53    443.72 3561.93
## 2  437.01    438.13 3284.25
## 3  442.56    456.56 4088.57
## 4  464.16    674.00 4149.01
## 5  482.45    447.20 3686.80
## 6  482.01    467.46 3451.39
# Preview the last six rows of the data set
tail(rbigdp, n = 6L)
##    Quarter Agriculture Mining Manufacturing Electricity Construction
## 49      Q1     1318.49 149.41       1193.32      163.55       585.54
## 50      Q2     1052.14 143.21       1213.61      163.84       601.01
## 51      Q3     1811.97 167.33       1214.35      164.64       600.90
## 52      Q4     1477.83 182.50       1256.11      166.95       638.30
## 53      Q1     1349.68 161.19       1234.48      173.67       627.13
## 54      Q2     1061.16 156.88       1324.92      176.04       639.83
##      Trade Finance Community   Gross
## 49 2238.62 1169.02   1008.24 7826.20
## 50 2271.28 1187.39   1106.02 7738.50
## 51 2384.67 1250.67   1205.26 8799.80
## 52 2651.34 1326.48   1329.73 9029.24
## 53 2419.61 1263.22   1076.60 8305.57
## 54 2464.52 1278.44   1246.02 8347.80
# Per-variable summary (level counts for Quarter, quartiles and mean for the
# numeric series), printed with digits = 8
summary(object = rbigdp, digits = 8)
##  Quarter  Agriculture            Mining          Manufacturing      
##  Q1:14   Min.   : 739.7400   Min.   : 81.61000   Min.   : 595.7800  
##  Q2:14   1st Qu.:1024.7200   1st Qu.:101.09250   1st Qu.: 666.3750  
##  Q3:13   Median :1149.8500   Median :117.38000   Median : 782.8450  
##  Q4:13   Mean   :1192.6080   Mean   :123.03037   Mean   : 861.4102  
##          3rd Qu.:1364.5975   3rd Qu.:142.08500   3rd Qu.:1055.6275  
##          Max.   :1827.3800   Max.   :182.50000   Max.   :1324.9200  
##   Electricity         Construction          Trade          
##  Min.   : 88.75000   Min.   :191.1100   Min.   : 705.3400  
##  1st Qu.:110.55000   1st Qu.:253.0150   1st Qu.: 966.0275  
##  Median :122.41500   Median :314.8950   Median :1268.8150  
##  Mean   :127.95352   Mean   :367.9585   Mean   :1428.4335  
##  3rd Qu.:148.50750   3rd Qu.:485.8875   3rd Qu.:1815.7450  
##  Max.   :176.04000   Max.   :639.8300   Max.   :2651.3400  
##     Finance            Community             Gross         
##  Min.   : 432.5300   Min.   : 438.1300   Min.   :3284.250  
##  1st Qu.: 576.0350   1st Qu.: 630.9725   1st Qu.:4437.705  
##  Median : 713.7000   Median : 797.1150   Median :5319.255  
##  Mean   : 778.4502   Mean   : 798.8476   Mean   :5678.695  
##  3rd Qu.: 970.3150   3rd Qu.: 932.4525   3rd Qu.:6881.235  
##  Max.   :1326.4800   Max.   :1329.7300   Max.   :9029.240
#############################

# Scatter-plot matrix of the nine numeric series (columns 2 to 10).
# Points are coloured by quarter: recycling a fixed 3-colour palette over the
# rows would give colours that carry no meaning.
library(RColorBrewer)
quarter <- factor(rbigdp$Quarter)
# brewer.pal() needs at least 3 colours, hence the max().
quarter_palette <- brewer.pal(max(3L, nlevels(quarter)), "Set1")
plot(rbigdp[, 2:10], col = quarter_palette[as.integer(quarter)])

####################################

# Alternative way to get a summary: descriptive-statistics table printed as
# plain text by stargazer
library(stargazer)
stargazer(rbigdp, type = "text")
## 
## ========================================================
## Statistic     N    Mean    St. Dev.     Min       Max   
## --------------------------------------------------------
## Agriculture   54 1,192.608  261.931   739.740  1,827.380
## Mining        54  123.030   25.512    81.610    182.500 
## Manufacturing 54  861.410   225.374   595.780  1,324.920
## Electricity   54  127.954   24.220    88.750    176.040 
## Construction  54  367.959   142.115   191.110   639.830 
## Trade         54 1,428.434  557.657   705.340  2,651.340
## Finance       54  778.450   259.532   432.530  1,326.480
## Community     54  798.848   220.475   438.130  1,329.730
## Gross         54 5,678.695 1,583.311 3,284.250 9,029.240
## --------------------------------------------------------
# Alternative way to get a summary: pastecs::stat.desc() on the numeric
# columns (2 to 10), rounded to 4 decimals
library(pastecs)
round(
  stat.desc(rbigdp[, 2:10]),
  digits = 4
)
##              Agriculture    Mining Manufacturing Electricity Construction
## nbr.val          54.0000   54.0000       54.0000     54.0000      54.0000
## nbr.null          0.0000    0.0000        0.0000      0.0000       0.0000
## nbr.na            0.0000    0.0000        0.0000      0.0000       0.0000
## min             739.7400   81.6100      595.7800     88.7500     191.1100
## max            1827.3800  182.5000     1324.9200    176.0400     639.8300
## range          1087.6400  100.8900      729.1400     87.2900     448.7200
## sum           64400.8300 6643.6400    46516.1500   6909.4900   19869.7600
## median         1149.8500  117.3800      782.8450    122.4150     314.8950
## mean           1192.6080  123.0304      861.4102    127.9535     367.9585
## SE.mean          35.6442    3.4718       30.6695      3.2959      19.3394
## CI.mean.0.95     71.4933    6.9636       61.5152      6.6108      38.7899
## var           68607.5911  650.8851    50793.2583    586.6067   20196.6790
## std.dev         261.9305   25.5125      225.3736     24.2200     142.1150
## coef.var          0.2196    0.2074        0.2616      0.1893       0.3862
##                    Trade    Finance  Community        Gross
## nbr.val          54.0000    54.0000    54.0000      54.0000
## nbr.null          0.0000     0.0000     0.0000       0.0000
## nbr.na            0.0000     0.0000     0.0000       0.0000
## min             705.3400   432.5300   438.1300    3284.2500
## max            2651.3400  1326.4800  1329.7300    9029.2400
## range          1946.0000   893.9500   891.6000    5744.9900
## sum           77135.4100 42036.3100 43137.7700  306649.5100
## median         1268.8150   713.7000   797.1150    5319.2550
## mean           1428.4335   778.4502   798.8476    5678.6946
## SE.mean          75.8875    35.3179    30.0028     215.4613
## CI.mean.0.95    152.2111    70.8387    60.1781     432.1607
## var          310981.5413 67357.0104 48609.1615 2506873.7078
## std.dev         557.6572   259.5323   220.4749    1583.3110
## coef.var          0.3904     0.3334     0.2760       0.2788
# Alternative way to get a summary: psych::describe() on the numeric columns
# (2 to 10)
library(psych)
describe(rbigdp[, 2:10])
##               vars  n    mean      sd  median trimmed     mad     min
## Agriculture      1 54 1192.61  261.93 1149.85 1181.08  257.68  739.74
## Mining           2 54  123.03   25.51  117.38  121.43   26.66   81.61
## Manufacturing    3 54  861.41  225.37  782.85  845.40  219.52  595.78
## Electricity      4 54  127.95   24.22  122.41  127.34   27.54   88.75
## Construction     5 54  367.96  142.12  314.89  358.21  129.95  191.11
## Trade            6 54 1428.43  557.66 1268.82 1384.68  551.44  705.34
## Finance          7 54  778.45  259.53  713.70  760.18  248.65  432.53
## Community        8 54  798.85  220.47  797.12  791.28  219.11  438.13
## Gross            9 54 5678.69 1583.31 5319.26 5588.52 1623.88 3284.25
##                   max   range skew kurtosis     se
## Agriculture   1827.38 1087.64 0.39    -0.37  35.64
## Mining         182.50  100.89 0.51    -0.69   3.47
## Manufacturing 1324.92  729.14 0.56    -1.13  30.67
## Electricity    176.04   87.29 0.28    -1.13   3.30
## Construction   639.83  448.72 0.57    -1.15  19.34
## Trade         2651.34 1946.00 0.59    -0.95  75.89
## Finance       1326.48  893.95 0.59    -0.92  35.32
## Community     1329.73  891.60 0.26    -0.61  30.00
## Gross         9029.24 5744.99 0.50    -0.95 215.46
# To read the documentation of a function, use help() or ?, for example:
#?stargazer

#######################

# Column-wise means of all numeric variables. The first column (Quarter) is
# excluded because it is nominal data.
# vapply() with numeric(1) guarantees exactly one numeric value per column,
# whereas the return type of sapply() depends on its input.
meanrbi <- vapply(rbigdp[, 2:10], mean, numeric(1), na.rm = TRUE)
meanrbi
##   Agriculture        Mining Manufacturing   Electricity  Construction 
##     1192.6080      123.0304      861.4102      127.9535      367.9585 
##         Trade       Finance     Community         Gross 
##     1428.4335      778.4502      798.8476     5678.6946
# Means of the first three series
meanrbi[seq_len(3L)]
##   Agriculture        Mining Manufacturing 
##     1192.6080      123.0304      861.4102
# Variances of all variables in the data set except the 'Quarter' variable.
# vapply() with numeric(1) guarantees exactly one numeric value per column,
# whereas the return type of sapply() depends on its input.
varrbi <- vapply(rbigdp[, 2:10], var, numeric(1), na.rm = TRUE)
varrbi
##   Agriculture        Mining Manufacturing   Electricity  Construction 
##    68607.5911      650.8851    50793.2583      586.6067    20196.6790 
##         Trade       Finance     Community         Gross 
##   310981.5413    67357.0104    48609.1615  2506873.7078
# Variances of the first three series of the data set
varrbi[seq_len(3L)]
##   Agriculture        Mining Manufacturing 
##    68607.5911      650.8851    50793.2583

3. Correlation between the various sector-wise GDP series

# Correlation matrix of the numeric series (columns 2 to 10), with values
# rounded to 2 decimals
corrbi <- round(cor(rbigdp[, 2:10]), digits = 2)
corrbi
##               Agriculture Mining Manufacturing Electricity Construction
## Agriculture          1.00   0.60          0.47        0.45         0.47
## Mining               0.60   1.00          0.95        0.94         0.95
## Manufacturing        0.47   0.95          1.00        0.98         0.99
## Electricity          0.45   0.94          0.98        1.00         0.99
## Construction         0.47   0.95          0.99        0.99         1.00
## Trade                0.52   0.97          0.99        0.98         0.99
## Finance              0.47   0.95          0.99        0.99         0.99
## Community            0.43   0.94          0.93        0.92         0.92
## Gross                0.61   0.98          0.98        0.97         0.98
##               Trade Finance Community Gross
## Agriculture    0.52    0.47      0.43  0.61
## Mining         0.97    0.95      0.94  0.98
## Manufacturing  0.99    0.99      0.93  0.98
## Electricity    0.98    0.99      0.92  0.97
## Construction   0.99    0.99      0.92  0.98
## Trade          1.00    0.99      0.93  0.99
## Finance        0.99    1.00      0.93  0.98
## Community      0.93    0.93      1.00  0.93
## Gross          0.99    0.98      0.93  1.00
# Plot the correlation matrix twice: first as numbers, then as circles
library(corrplot)
for (corr_style in c("number", "circle")) {
  corrplot(corrbi, method = corr_style)
}

############################

#custom function

## the following code and figure is adapted from the help file for pairs
## put (absolute) correlations on the upper panels,
## with size proportional to the correlations.

#first create a function (panel.cor)

# panel.cor: panel function for pairs() that prints the correlation of x and y
# in the middle of the panel, with the text size scaled by |r|.
#
# Arguments:
#   x, y     numeric vectors supplied by pairs() for one panel.
#   digits   number of significant digits used to format the coefficient.
#   prefix   string pasted in front of the formatted coefficient.
#   cex.cor  base text size; when missing it is derived from the label width
#            as 0.8 / strwidth(label).
#   ...      further arguments forwarded by pairs(); accepted and ignored.
#
# Called for its side effect (drawing text); returns the value of text().
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...) {
  old_usr <- par("usr")
  # Restore the caller's user coordinates when the panel is finished.
  on.exit(par(usr = old_usr), add = TRUE)
  # Unit square, so that (0.5, 0.5) is the centre of the panel.
  par(usr = c(0, 1, 0, 1))

  r <- cor(x, y)
  # Formatting r alongside 0.123456789 keeps a consistent number of decimals
  # in the label (e.g. "0.50" rather than "0.5").
  label <- paste0(prefix, format(c(r, 0.123456789), digits = digits)[1])

  # Use the supplied cex.cor when given; otherwise size the text to the panel.
  if (missing(cex.cor)) {
    cex.cor <- 0.8 / strwidth(label)
  }
  text(0.5, 0.5, label, cex = cex.cor * abs(r))
}

# Apply the function to the data: smoothed scatterplots in the lower panels,
# correlation coefficients in the upper panels.
pairs(
  rbigdp[, 2:10],
  upper.panel = panel.cor,
  lower.panel = panel.smooth
)

4. Plotting GDP and superimposing the trend of agriculture on the same graph

# Before plotting the time series, plot a histogram with a superimposed
# density curve for each of the Agriculture and GDP variables.

par(mfrow = c(1, 2)) # two panels, so both histograms sit side by side

# Histogram with density curve of the 'Agriculture' variable.
# freq = FALSE puts the bars on the density scale so the curve overlays them;
# FALSE is spelled out because F is an ordinary, reassignable variable.
hist(rbigdp$Agriculture, col = "steelblue", xlab = "agriculture", freq = FALSE)
lines(density(rbigdp$Agriculture), col = "violetred", lwd = 3)

# Histogram with density curve of the 'Gross/GDP' variable
hist(rbigdp$Gross, col = "firebrick", xlab = "Gross/GDP", freq = FALSE)
lines(density(rbigdp$Gross), col = "blue4", lwd = 3)

#########################

# Scatter-plot matrix for all numeric variables: histograms along the
# diagonal and scatterplots with regression lines in the other cells.
library(car)
par(mfrow = c(1, 1))
scatterplotMatrix(
  rbigdp[, 2:10],
  smooth = FALSE,
  diagonal = "histogram"
)

#############################

# Time series of GDP and Agriculture combined in a single plot

# Plot-window margins (bottom, left, top, right) and a single-panel layout
par(mar = c(5, 5, 4, 1))
par(mfrow = c(1, 1))

# Gross/GDP series over time, drawn as a thick blue line
plot(
  rbigdp$Gross,
  type = "l", lwd = 4, col = "blue",
  ylim = c(0, 10000),
  main = "Plot of GDP & Agriculture over time",
  xlab = "no of quarterly periods",
  ylab = "Gross/GDP & Agriculture"
)

# Agriculture series superimposed on the same axes, in red
lines(rbigdp$Agriculture, lwd = 4, col = "red")

# Overlay the individual observations of both series (pch = 17)
for (series in list(rbigdp$Gross, rbigdp$Agriculture)) {
  points(series, pch = 17)
}

# Label the two lines directly on the graph
text(20, 6500, "GDP", cex = 2, col = "blue")
text(35, 2800, "Agriculture", cex = 2, col = "red")

See the analysis/document in your browser by clicking the link: Assignment1_CAEA_SavitaKulkarni_R

      #########################################################