1.Replace

# Folder that holds the Wooldridge Stata files. The path is kept in a variable
# and passed to read.dta() instead of calling setwd() and rm(list = ls()), so
# running this script neither changes the working directory nor wipes objects
# that already exist in the session.
data_dir <- "/Users/vancam/Documents/WAIKATO-Thesis/Rworking/Wooldridge/StataFile"
library(foreign)
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
# Read the Stata data set into a data frame.
charity <- read.dta(file.path(data_dir, "CHARITY.DTA"))
#1.1 Recode: replace a range of values with a single value, for example every value from 1 to 5 becomes 1 and every value from 6 to 10 becomes 99
library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
# Each call rewrites `gift` in place, so later calls see the result of the
# earlier ones. car::recode is named explicitly because dplyr also exports a
# recode() with a different rule syntax.
charity$gift <- car::recode(charity$gift, "1:5=1") # Every value from 1 to 5 becomes 1
charity$gift <- car::recode(charity$gift, "0=100") # Every value equal to 0 becomes 100
charity$gift <- car::recode(charity$gift, "6:10=99") # Every value from 6 to 10 becomes 99
#1.2. Recode one by one, some together
# Several rules are separated by ";". NOTE: 6:10 was already turned into 99
# above, so no 6, 8 or 10 is left in `gift`; this call only shows the syntax.
charity$gift <- car::recode(charity$gift, "6=7;8=9;10=11")

#1.3. Recode some value into NA
# NOTE: every 0 was already turned into 100 above, so nothing matches here.
charity$gift <- car::recode(charity$gift, "0=NA") # Every value equal to 0 becomes NA

#1.4. Recode some value as number into characters of string
charity$gift <- car::recode(charity$gift, "1='one'") # Number to string; `gift` becomes a character column
# Recode ranges of `giftlast` into labels. The source column must be spelled
# out in full: `charity$last` is not a column of the data (it evaluates to
# NULL), which left only the `else` label in the result.
charity$giftlast <- car::recode(charity$giftlast, "1:5='low'; 6:10='middle'; else='high'")

#1.5. Recode all the value in a data frame (not only the value in one variable)
# Recode 1 to 777 wherever 1 appears in the practice data frame
# Recode 0 to 888 wherever 0 appears in the practice data frame
# The second argument of apply() is MARGIN (1 = rows, 2 = columns), not the
# number of variables; 6 is not a valid margin for a data frame, which is what
# raised the error. apply() would also turn the data frame into a matrix.
# lapply() visits one column at a time and keeps the data frame intact.
# Missing values need no extra argument: recode() leaves NA untouched unless a
# rule mentions NA.
# The result goes into a practice copy so that `charity` keeps its own values
# for the sections that follow.
charity_recoded <- charity
charity_recoded[] <- lapply(
  charity_recoded,
  function(column) car::recode(column, "1=777; 0=888")
)
# Alternative to recode(): overwrite the matching entries directly.
# which() turns the comparison into row positions (NA comparisons are skipped),
# and replace() writes 4 into those positions of "gift".
charity$gift <- replace(charity$gift, which(charity$gift == 2), 4)

# Count how many entries of "gift" are now equal to 4.
sum(charity$gift == 4, na.rm = TRUE)
## [1] 0

2. Add variables and do some calculations with the newly created variables

#2.1. Create a new binary variable
# NOTE(review): the recorded str() output lists giftlast as character and both
# dummies with a single level, so `giftlast == 1` presumably never matches at
# this point -- confirm which values were meant to be compared.
# giftlastdum: factor that is 0 where giftlast equals 1 and 1 otherwise.
charity$giftlastdum <- factor(ifelse(charity$giftlast == 1, 0, 1))
# dum2: factor that is "yes" where giftlast equals 1 and "no" otherwise.
charity$dum2 <- factor(ifelse(charity$giftlast == 1, "yes", "no"))

#2.2. Create a new variable with condition
#2.3. Create a new variable equal to some calculation
# All new columns are added in one mutate() call; mutate() evaluates its
# arguments in order, so a later column can use one defined just above it.
charity <- charity %>%
  mutate(
    # "high" if gift > 10, "low" otherwise.
    # NOTE(review): gift holds text after the recodes above, so this compares
    # strings rather than numbers -- confirm that is intended.
    highlow = ifelse(gift > 10, "high", "low"),
    weekslast1 = weekslast / 100,
    weekslast2 = weekslast1 - weekslast,
    weekslast3 = weekslast1 * weekslast2,
    weeklast4 = weekslast1 + weekslast3 + 9,
    weekslast5 = weekslast + 20
  )

#2.4. Create a new variable with random value
set.seed(2612) # Fixed seed so the same random values are drawn on every run
nobs <- nrow(charity) # One draw per row, instead of a hard-coded row count
# Bounds of the uniform distribution. They are not called `min`/`max`, so the
# base functions of the same name are not hidden by plain numbers.
lower <- 0
upper <- 100
xi <- runif(nobs, min = lower, max = upper)
charity <- mutate(charity, random = xi) # Create a new variable named "random" holding the values drawn above

3. Change variable’s names

# Rename two columns: "respond" becomes "res" and "gift" becomes "gf".
# From here on the old names no longer exist in the data frame.
charity <- charity %>%
  rename(res = respond, gf = gift)

4. Change variable’s type

charity$weekslast1 <- as.integer(charity$weekslast) # Store "weekslast" as whole numbers in "weekslast1" (overwrites the earlier weekslast1)
# "gift" was renamed to "gf" in section 3, so the conversion has to address
# `gf`; `charity$gift` is NULL and cannot be assigned back into the data frame.
# Text that is not a number (such as "one") becomes NA with a warning.
charity$gf <- as.numeric(charity$gf)
# Clean a string value before converting it to a number. The pattern is
# anchored so that only the exact text "01" is rewritten; unanchored, "101"
# would silently turn into "11".
charity$gf <- as.numeric(gsub("^01$", "1", charity$gf))

5. Checking the statistics of variables

# "respond" was renamed to "res" in section 3; `charity$respond` is NULL.
summary(charity$res) # Find the statistics of one variable
summary(charity) # Find the statistics of all variables in the sample.
##       res           gf               resplast        weekslast     
##  Min.   :0.0   Length:4268        Min.   :0.0000   Min.   : 13.14  
##  1st Qu.:0.0   Class :character   1st Qu.:0.0000   1st Qu.: 26.14  
##  Median :0.0   Mode  :character   Median :0.0000   Median : 51.29  
##  Mean   :0.4                      Mean   :0.3348   Mean   : 59.05  
##  3rd Qu.:1.0                      3rd Qu.:1.0000   3rd Qu.:103.86  
##  Max.   :1.0                      Max.   :1.0000   Max.   :195.00  
##     propresp         mailsyear      giftlast            avggift       
##  Min.   :0.09091   Min.   :0.25   Length:4268        Min.   :   1.00  
##  1st Qu.:0.28571   1st Qu.:1.75   Class :character   1st Qu.:  10.00  
##  Median :0.44444   Median :2.00   Mode  :character   Median :  10.00  
##  Mean   :0.48436   Mean   :2.05                      Mean   :  18.24  
##  3rd Qu.:0.66667   3rd Qu.:2.50                      3rd Qu.:  23.33  
##  Max.   :1.00000   Max.   :3.50                      Max.   :5005.00  
##  giftlastdum dum2        highlow            weekslast1    
##  1:4268      no:4268   Length:4268        Min.   : 13.00  
##                        Class :character   1st Qu.: 26.00  
##                        Mode  :character   Median : 51.00  
##                                           Mean   : 58.78  
##                                           3rd Qu.:103.00  
##                                           Max.   :195.00  
##    weekslast2        weekslast3         weeklast4          weekslast5    
##  Min.   :-193.05   Min.   :-376.447   Min.   :-365.498   Min.   : 33.14  
##  1st Qu.:-102.82   1st Qu.:-106.784   1st Qu.: -96.746   1st Qu.: 46.14  
##  Median : -50.77   Median : -26.039   Median : -16.526   Median : 71.29  
##  Mean   : -58.46   Mean   : -53.963   Mean   : -44.373   Mean   : 79.05  
##  3rd Qu.: -25.88   3rd Qu.:  -6.766   3rd Qu.:   2.495   3rd Qu.:123.86  
##  Max.   : -13.01   Max.   :  -1.710   Max.   :   7.421   Max.   :215.00  
##      random        
##  Min.   : 0.02806  
##  1st Qu.:25.47671  
##  Median :50.16408  
##  Mean   :50.17442  
##  3rd Qu.:74.98432  
##  Max.   :99.98975
##summary(charity, c=(respond, gift))### Not done

# "gift" was renamed to "gf" in section 3, and the recodes in section 1 put
# text into it, so it is converted to numbers first (text that is not a number
# becomes NA) and the NA entries are skipped when computing the statistics.
gf_numeric <- as.numeric(charity$gf)
sd(gf_numeric, na.rm = TRUE) # Checking the standard deviation of the variable
var(gf_numeric, na.rm = TRUE) # Checking the variance of the variable
hist(charity$resplast) # Histogram graph

plot(charity$resplast) # Scatter plot graph

plot(charity$weekslast)

6. Checking data

# List the column (variable) names of the data frame
colnames(charity)
##  [1] "res"         "gf"          "resplast"    "weekslast"   "propresp"   
##  [6] "mailsyear"   "giftlast"    "avggift"     "giftlastdum" "dum2"       
## [11] "highlow"     "weekslast1"  "weekslast2"  "weekslast3"  "weeklast4"  
## [16] "weekslast5"  "random"
#Checking the missing value
# "gift" was renamed to "gf" in section 3; `charity$gift` is NULL, so counting
# its NA entries only produced a warning and 0.
sum(is.na(charity$gf))
# Restore the objects saved in the .rda file; the code below uses `export11`.
load("/Users/vancam/Documents/WAIKATO-Thesis/VEScore/Vandata/export11.rda")
# Number of missing values in the column "exportval"
sum(is.na(export11[["exportval"]]))
## [1] 23370
# summary() shows the NA count of each column on the last line of its block
summary(export11)
##     province         fcode           taxcode          industrialzone 
##  Min.   : 1.00   Min.   :      1   Length:53965       Min.   :1.000  
##  1st Qu.:27.00   1st Qu.:   4310   Class :character   1st Qu.:2.000  
##  Median :72.00   Median :   8775   Mode  :character   Median :2.000  
##  Mean   :52.52   Mean   : 182734                      Mean   :1.892  
##  3rd Qu.:79.00   3rd Qu.:  65234                      3rd Qu.:2.000  
##  Max.   :98.00   Max.   :1091031                      Max.   :2.000  
##                                                       NA's   :1131   
##      ftype           export       exportval           importvol        
##  Min.   : 1.00   Min.   :0.00   Min.   :0.000e+00   Min.   :0.000e+00  
##  1st Qu.: 9.00   1st Qu.:2.00   1st Qu.:0.000e+00   1st Qu.:0.000e+00  
##  Median : 9.00   Median :2.00   Median :0.000e+00   Median :0.000e+00  
##  Mean   : 9.05   Mean   :1.81   Mean   :2.670e+06   Mean   :2.167e+06  
##  3rd Qu.:10.00   3rd Qu.:2.00   3rd Qu.:0.000e+00   3rd Qu.:0.000e+00  
##  Max.   :14.00   Max.   :2.00   Max.   :9.659e+09   Max.   :5.842e+09  
##                  NA's   :1129   NA's   :23370       NA's   :23605      
##   industrycode       labor             capital             sales          
##  Min.   :10101   Min.   :    0.00   Min.   :       0   Min.   :        0  
##  1st Qu.:14100   1st Qu.:    5.00   1st Qu.:     884   1st Qu.:      524  
##  Median :20231   Median :   12.00   Median :    2827   Median :     2630  
##  Mean   :20383   Mean   :   87.23   Mean   :   43856   Mean   :    63127  
##  3rd Qu.:25910   3rd Qu.:   40.00   3rd Qu.:   12307   3rd Qu.:    13648  
##  Max.   :33200   Max.   :79909.00   Max.   :71438869   Max.   :127711298  
##                  NA's   :1331       NA's   :2871       NA's   :957
#Checking which value is missing
#which(is.na(export11$sales))# Hide the results

7. Treating missing data

# Build a small two-column data frame and blank out one cell in each column
vandf <- data.frame(c1 = 1:8, c2 = 8:15)
vandf[6, 2] <- NA # 6th value of column 2 becomes missing
vandf[4, 1] <- NA # 4th value of column 1 becomes missing

#1. Excluding the missing value
# na.exclude() on a single column returns that column without its NA entries
# and records the dropped position in the "na.action" attribute.
na.exclude(vandf$c1)
## [1] 1 2 3 5 6 7 8
## attr(,"na.action")
## [1] 4
## attr(,"class")
## [1] "exclude"
na.exclude(vandf$c2)
## [1]  8  9 10 11 12 14 15
## attr(,"na.action")
## [1] 6
## attr(,"class")
## [1] "exclude"
#2. Replace missing value with the mean of non-missing value

#3. Replace missing value with mean, median and a specific value
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     combine, src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
vandf <- data.frame(c1 = 1:8, c2 = 8:15) # Create a new data frame
vandf[4, 1] <- vandf[6, 2] <- NA # One missing value in each column

vandf$c1 <- impute(vandf$c1, mean) # Replace the missing value with the mean of the variable
# "gift" was renamed to "gf" in section 3; `charity$gift` is NULL, so the
# imputation has to address `gf`.
charity$gf <- impute(charity$gf, mean)
impute(vandf$c2, median) # Replace the missing value with the median of the variable
##   1   2   3   4   5   6   7   8 
##   8   9  10  11  12 11*  14  15
# Replace the missing value with a specific value. This runs on c2 because c1
# was already filled in and stored above, so it has no NA left to replace;
# the median result for c2 was only printed, so c2 still has its NA.
impute(vandf$c2, 20)

8. Label the variable

library(Hmisc)
# "gift" was renamed to "gf" in section 3; `charity$gift` is NULL and an
# attribute cannot be set on NULL, so the label is attached to `gf`.
label(charity$gf) <- "This is the amount of money giving to the charity each year"
label(charity$gf) # Show the label that was just attached
str(charity)
## 'data.frame':    4268 obs. of  17 variables:
##  $ res        : int  0 0 0 0 1 0 0 0 0 1 ...
##  $ gf         : chr  "100" "100" "100" "100" ...
##  $ resplast   : int  0 0 1 0 0 0 0 0 1 0 ...
##  $ weekslast  : num  143 65.4 13.1 120.1 103.9 ...
##  $ propresp   : num  0.3 0.3 0.3 0.3 0.2 ...
##  $ mailsyear  : num  2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
##  $ giftlast   : chr  "high" "high" "high" "high" ...
##  $ avggift    : num  10 10 10 10 10 10 10 6 10 5 ...
##  $ giftlastdum: Factor w/ 1 level "1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ dum2       : Factor w/ 1 level "no": 1 1 1 1 1 1 1 1 1 1 ...
##  $ highlow    : chr  "high" "high" "high" "high" ...
##  $ weekslast1 : int  143 65 13 120 103 129 143 103 13 51 ...
##  $ weekslast2 : num  -141.6 -64.8 -13 -118.9 -102.8 ...
##  $ weekslast3 : num  -202.45 -42.38 -1.71 -142.9 -106.78 ...
##  $ weeklast4  : num  -192.02 -32.73 7.42 -132.7 -96.75 ...
##  $ weekslast5 : num  163 85.4 33.1 140.1 123.9 ...
##  $ random     : num  90.3 14.6 66.8 42.1 42 ...

9. Some advanced statistics code

library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
#Summary some main statistic together
# summarise_each() and funs() are deprecated, and "gift" was renamed to "gf" in
# section 3, so the statistics are written out with summarise() on `gf`.
# `gf` holds text after the recodes in section 1, so it is converted to numbers
# first; text that is not a number becomes NA and is skipped via na.rm = TRUE.
# n counts all rows, including those with NA.
charity %>%
  transmute(gf_value = as.numeric(gf)) %>%
  summarise(
    mean = mean(gf_value, na.rm = TRUE),
    median = median(gf_value, na.rm = TRUE),
    max = max(gf_value, na.rm = TRUE),
    min = min(gf_value, na.rm = TRUE),
    sd = sd(gf_value, na.rm = TRUE),
    n = n()
  )
##Summary statistics - advanced
library(fBasics)
## Loading required package: timeDate
## Loading required package: timeSeries
## 
## Rmetrics Package fBasics
## Analysing Markets and calculating Basic Statistics
## Copyright (C) 2005-2014 Rmetrics Association Zurich
## Educational Software for Financial Engineering and Computational Science
## Rmetrics is free software and comes with ABSOLUTELY NO WARRANTY.
## https://www.rmetrics.org --- Mail to: info@rmetrics.org
## 
## Attaching package: 'fBasics'
## The following object is masked from 'package:car':
## 
##     densityPlot
# basicStats() computes quantiles and therefore needs numeric input; the data
# frame also holds character and factor columns, so only the numeric columns
# are passed on.
basicStats(charity[vapply(charity, is.numeric, logical(1))])
# Summary of more than 1 variable in one code
# ("gift" was renamed to "gf" in section 3, so the new name is selected.)
summary(charity[, c("gf", "resplast")])
# Show the first 10 rows of the data frame (tail() further down shows the last 10)
head(charity, 10)
##    res  gf resplast weekslast propresp mailsyear giftlast avggift
## 1    0 100        0 143.00000      0.3       2.5     high      10
## 2    0 100        0  65.42857      0.3       2.5     high      10
## 3    0 100        1  13.14286      0.3       2.5     high      10
## 4    0 100        0 120.14286      0.3       2.5     high      10
## 5    1  99        0 103.85714      0.2       2.5     high      10
## 6    0 100        0 129.14285      0.3       2.5     high      10
## 7    0 100        0 143.00000      0.3       2.5     high      10
## 8    0 100        0 103.85714      0.5       2.5     high       6
## 9    0 100        1  13.14286      0.3       2.5     high      10
## 10   1 one        0  51.28571      0.5       2.5     high       5
##    giftlastdum dum2 highlow weekslast1 weekslast2  weekslast3   weeklast4
## 1            1   no    high        143 -141.57000 -202.445100 -192.015100
## 2            1   no    high         65  -64.77429  -42.380893  -32.726607
## 3            1   no    high         13  -13.01143   -1.710074    7.421355
## 4            1   no    high        120 -118.94143 -142.899638 -132.698210
## 5            1   no    high        103 -102.81857 -106.784424  -96.745852
## 6            1   no    high        129 -127.85142 -165.110977 -154.819548
## 7            1   no    high        143 -141.57000 -202.445100 -192.015100
## 8            1   no    high        103 -102.81857 -106.784424  -96.745852
## 9            1   no    high         13  -13.01143   -1.710074    7.421355
## 10           1   no    high         51  -50.77286  -26.039221  -16.526364
##    weekslast5    random
## 1   163.00000 90.290620
## 2    85.42857 14.556989
## 3    33.14286 66.835776
## 4   140.14286 42.099129
## 5   123.85714 41.984395
## 6   149.14285  1.513396
## 7   163.00000 92.764559
## 8   123.85714 24.556847
## 9    33.14286  7.273638
## 10   71.28571 56.735000
# Show the last 10 rows of the data frame
tail(charity, 10)
##      res  gf resplast weekslast propresp mailsyear giftlast avggift
## 4259   0 100        1  51.28571 1.000000      0.50     high      30
## 4260   1  99        1  51.28571 0.800000      1.25     high      10
## 4261   0 100        0  51.28571 0.333333      0.75     high      50
## 4262   1  25        0  51.28571 0.333333      0.75     high      25
## 4263   0 100        0  51.28571 0.333333      0.75     high      50
## 4264   1  99        1  13.14286 1.000000      0.75     high      15
## 4265   1  25        1  13.14286 0.666667      0.75     high      25
## 4266   1  25        0  51.28571 0.333333      0.75     high      25
## 4267   0 100        0  51.28571 0.333333      0.75     high      50
## 4268   1  25        0  51.28571 0.333333      0.75     high      25
##      giftlastdum dum2 highlow weekslast1 weekslast2 weekslast3  weeklast4
## 4259           1   no    high         51  -50.77286 -26.039221 -16.526364
## 4260           1   no    high         51  -50.77286 -26.039221 -16.526364
## 4261           1   no    high         51  -50.77286 -26.039221 -16.526364
## 4262           1   no    high         51  -50.77286 -26.039221 -16.526364
## 4263           1   no    high         51  -50.77286 -26.039221 -16.526364
## 4264           1   no    high         13  -13.01143  -1.710074   7.421355
## 4265           1   no    high         13  -13.01143  -1.710074   7.421355
## 4266           1   no    high         51  -50.77286 -26.039221 -16.526364
## 4267           1   no    high         51  -50.77286 -26.039221 -16.526364
## 4268           1   no    high         51  -50.77286 -26.039221 -16.526364
##      weekslast5    random
## 4259   71.28571  0.335298
## 4260   71.28571 66.769744
## 4261   71.28571 44.764990
## 4262   71.28571 20.877626
## 4263   71.28571 14.409151
## 4264   33.14286 38.035370
## 4265   33.14286 88.917001
## 4266   71.28571 90.931567
## 4267   71.28571  5.500989
## 4268   71.28571 64.568571