1 SetUp

Clear everything.

# Clear the work space
rm(list = ls()) # Clear environment
gc()            # Clear unused memory
##          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 525253 28.1    1166077 62.3         NA   669291 35.8
## Vcells 964700  7.4    8388608 64.0      32768  1840369 14.1
cat("\f")       # Clear the console
#dev.off()       # Clear the charts
 
sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.32   R6_2.5.1        jsonlite_1.8.7  evaluate_0.21  
##  [5] cachem_1.0.8    rlang_1.1.1     cli_3.6.1       rstudioapi_0.14
##  [9] jquerylib_0.1.4 bslib_0.5.0     rmarkdown_2.21  tools_4.2.1    
## [13] xfun_0.39       yaml_2.3.7      fastmap_1.1.1   compiler_4.2.1 
## [17] htmltools_0.5.5 knitr_1.42      sass_0.4.6
library(visdat)
library(vtable)
## Loading required package: kableExtra
library(gtools)

2 Import

2.1 mydata_1960_1980

df1 <- read.csv("~/Library/CloudStorage/Dropbox/WCAS/Summer/Data Analysis/share/Day 9/mydata_1960_1980.csv")

vis_dat(df1)

??st
st(df1)
Summary Statistics
Variable N Mean Std. Dev. Min Pctl. 25 Pctl. 75 Max
year 2850 1974 8.7 1960 1967 1982 1989
gdppc 2065 9256 10856 179 1695 15298 81947
unemp 448 7.9 4.8 0.6 4.1 10 28
export 1740 31925198701 60411559532 126529127 2789683321 30484781664 556004000000
import 1750 32304559329 64205761728 198918483 3778882830 29240323123 653068000000
table(df1$year)
## 
## 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 
##   95   95   95   95   95   95   95   95   95   95   95   95   95   95   95   95 
## 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 
##   95   95   95   95   95   95   95   95   95   95   95   95   95   95

2.2 mydata_1990_2020

df2 <- read.csv("~/Library/CloudStorage/Dropbox/WCAS/Summer/Data Analysis/share/Day 9/mydata_1990_2020.csv")

vis_dat(df2)

??st
st(df2)
Summary Statistics
Variable N Mean Std. Dev. Min Pctl. 25 Pctl. 75 Max
year 2280 2002 6.9 1990 1996 2007 2013
Gdppc 2254 13746 15846 198 2486 22950 87717
unemp 2121 9.1 6.2 0.6 4.8 11 52
export 2209 102121255849 203883942176 223329024 5575464298 104000000000 1910000000000
Import 2209 101990170968 223469497481 312413409 6518361074 90823243497 2280000000000
table(df2$year)
## 
## 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 
##   95   95   95   95   95   95   95   95   95   95   95   95   95   95   95   95 
## 2006 2007 2008 2009 2010 2011 2012 2013 
##   95   95   95   95   95   95   95   95

3 Append the data

Put one data on top of the other.

?rbind      # gives an error
# rbind(df1, df2)

?smartbind  # brute force append
appended_dataset <- smartbind(df1, df2)

vis_dat(appended_dataset[,c(1:2, 3,6:8)])

names(df2)[names(df2) == 'Gdppc']  <- 'gdppc'
names(df2)[names(df2) == 'Import'] <- 'import'

appended_dataset <- rbind(df1, df2)
vis_dat(appended_dataset)

4 MERGING

4.1 Import the using data

df3 <- read.csv("~/Library/CloudStorage/Dropbox/WCAS/Summer/Data Analysis/share/Day 9/mydata_pol.csv")
vis_dat(df3)

4.2 Unique Identifier

appended_data is a panel data

df3 is a panel data.

country, year together uniquely identify each row.

names(appended_dataset)
## [1] "year"    "country" "gdppc"   "unemp"   "export"  "import"
names(df3)
## [1] "year"     "country"  "politics"
# remove(merged_data)
?merge

merged_data <- 
merge(x     = appended_dataset,
      y     = df3, 
      by.x    = c("country", "year"), # unique identifier name in x data
      by.y    = c("country", "year"), # unique identifier name in y data
      all.x = TRUE
      )

vis_dat(merged_data) 

7 columns make sense.

  • country and year together uniquely identify each row.

  • gdpcc, unemp, export and import from master data, while politics (polictical score from using data

5 Export Data

?write.csv
getwd() # print my working directory
## [1] "/Users/arvindsharma/Library/CloudStorage/Dropbox/WCAS/Summer/Data Analysis/share/Day 9"
write.csv(x         =  merged_data,  # data frame 
          file      = "exported_data.csv", 
          row.names = FALSE          # row numbers not printed out as a column
          )