1. Replace
# Directory that holds the Stata files. It is kept in a variable so that the
# script does not change the session's working directory (setwd) and does not
# wipe the caller's workspace (rm(list = ls())).
data_dir <- "/Users/vancam/Documents/WAIKATO-Thesis/Rworking/Wooldridge/StataFile"
library(foreign)
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
# Read the Stata data set that serves as the practice data frame.
charity <- read.dta(file.path(data_dir, "CHARITY.DTA"))
#1.1Recode: For example replace every value from 0-5 into 1, from 6-7 into 2
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
# recode() is called as car::recode() because dplyr also exports a recode();
# without the prefix, the function that runs depends on attach order.
charity$gift <- car::recode(charity$gift, "1:5=1") # Every value from 1 to 5 is turned into 1
charity$gift <- car::recode(charity$gift, "0=100") # Every value equal to 0 is turned into 100
charity$gift <- car::recode(charity$gift, "6:10=99") # Every value from 6 to 10 is turned into 99
#1.2. Recode one by one, some together
# The values 6, 8 and 10 were already mapped to 99 by the previous line, so on
# this data the call only illustrates the "a=b;c=d" syntax.
charity$gift <- car::recode(charity$gift, "6=7;8=9;10=11")
#1.3. Recode some value into NA
# The zeros were already mapped to 100 above, so this too only shows the syntax.
charity$gift <- car::recode(charity$gift, "0=NA") # Every value equal to 0 is turned into NA
#1.4. Recode some value as number into characters of string
# Recode one number into a string; from here on "gift" is a character column.
charity$gift <- car::recode(charity$gift, "1='one'")
# Recode ranges of numbers into strings. The source column is "giftlast":
# the data frame has no column "last", so charity$last is NULL and recoding it
# labelled every row "high".
charity$giftlast <- car::recode(charity$giftlast, "1:5='low'; 6:10='middle';else = 'high'")
#1.5. Recode all the values in a data frame (not only the values in one variable)
# Recode 1 to 777 wherever 1 appears in the practice data frame
# Recode 0 to 888 wherever 0 appears in the practice data frame
# apply() does not fit here: its second argument is the MARGIN (1 = rows,
# 2 = columns), not a number of variables, and it first coerces the data frame
# to a matrix. lapply() visits the columns one by one instead; assigning into
# charity_recoded[] keeps the data-frame structure.
# Values that the specification does not name are presumably left as they are,
# including NA, so no na.rm argument is needed (confirm in ?car::recode).
# The result goes into a copy so that the 0/1 codes in "charity" stay available
# to the sections that follow.
charity_recoded <- charity
charity_recoded[] <- lapply(
  charity,
  function(column) car::recode(column, "1=777; 0=888")
)
# Alternative to recode(): overwrite the matching positions directly.
# Every entry of "gift" that equals 2 becomes 4.
charity$gift <- replace(charity$gift, which(charity$gift == 2), 4)
# Number of entries that now equal 4 (missing values are not counted).
length(which(charity$gift == 4))
## [1] 0
2. Add variables and do some calculations with the newly created variables
#2.1. Create a new binary variable
# NOTE(review): section 1.4 replaces "giftlast" with text labels, so the test
# giftlast == 1 may never be TRUE here -- confirm the intended source column.
# Dummy stored as a factor: 0 where giftlast equals 1, 1 otherwise.
charity$giftlastdum <- factor(ifelse(charity$giftlast == 1, 0, 1))
# Same condition, labelled "yes" where giftlast equals 1 and "no" otherwise.
charity$dum2 <- factor(ifelse(charity$giftlast == 1, "yes", "no"))
#2.2. Create a new variable with a condition
#2.3. Create new variables from calculations
# A single mutate() call adds the columns in order; each expression may use the
# columns defined above it.
# NOTE(review): "weeklast4" lacks the "s" of its siblings; the name is kept
# because the printed output further down uses it.
charity <- mutate(
  charity,
  highlow = ifelse(gift > 10, "high", "low"), # "high" if gift > 10, else "low"
  weekslast1 = weekslast / 100,
  weekslast2 = weekslast1 - weekslast,
  weekslast3 = weekslast1 * weekslast2,
  weeklast4 = weekslast1 + weekslast3 + 9,
  weekslast5 = weekslast + 20
)
#2.4. Create a new variable with random values
set.seed(2612) # Fixed seed so the same values are drawn on every run
# One draw per row. The count comes from the data instead of a hard-coded
# 4268, so the column always has the right length.
nobs <- nrow(charity)
# Uniform draws between 0 and 100. The bounds are passed directly; the former
# variables named "min" and "max" were unused and shadowed the base functions.
xi <- runif(nobs, min = 0, max = 100)
charity <- mutate(charity, random = xi) # Add the draws as the new variable "random"
3. Change variable names
# Rename "respond" to "res" and "gift" to "gf"; later code must use the new names.
charity <- dplyr::rename(charity, res = respond, gf = gift)
4. Change variable types
# Store an integer version of "weekslast" in "weekslast1" (the fractional part
# is dropped; this replaces the weekslast1 computed in section 2.3).
charity$weekslast1 <- as.integer(charity$weekslast)
# Change a variable from string to number. "gift" was renamed to "gf" in
# section 3, so charity$gift is NULL and converting it fails with
# "replacement has 0 rows". The column also holds the text label "one" from
# section 1.4, which is mapped back to the digit before the conversion so that
# it does not become NA.
charity$gf <- as.numeric(gsub("one", "1", charity$gf, fixed = TRUE))
5. Checking the statistics of variables
# Statistics of one variable. "respond" was renamed to "res" in section 3;
# asking for charity$respond returns NULL and summarises nothing.
summary(charity$res)
summary(charity) # Statistics of all variables in the sample
## res gf resplast weekslast
## Min. :0.0 Length:4268 Min. :0.0000 Min. : 13.14
## 1st Qu.:0.0 Class :character 1st Qu.:0.0000 1st Qu.: 26.14
## Median :0.0 Mode :character Median :0.0000 Median : 51.29
## Mean :0.4 Mean :0.3348 Mean : 59.05
## 3rd Qu.:1.0 3rd Qu.:1.0000 3rd Qu.:103.86
## Max. :1.0 Max. :1.0000 Max. :195.00
## propresp mailsyear giftlast avggift
## Min. :0.09091 Min. :0.25 Length:4268 Min. : 1.00
## 1st Qu.:0.28571 1st Qu.:1.75 Class :character 1st Qu.: 10.00
## Median :0.44444 Median :2.00 Mode :character Median : 10.00
## Mean :0.48436 Mean :2.05 Mean : 18.24
## 3rd Qu.:0.66667 3rd Qu.:2.50 3rd Qu.: 23.33
## Max. :1.00000 Max. :3.50 Max. :5005.00
## giftlastdum dum2 highlow weekslast1
## 1:4268 no:4268 Length:4268 Min. : 13.00
## Class :character 1st Qu.: 26.00
## Mode :character Median : 51.00
## Mean : 58.78
## 3rd Qu.:103.00
## Max. :195.00
## weekslast2 weekslast3 weeklast4 weekslast5
## Min. :-193.05 Min. :-376.447 Min. :-365.498 Min. : 33.14
## 1st Qu.:-102.82 1st Qu.:-106.784 1st Qu.: -96.746 1st Qu.: 46.14
## Median : -50.77 Median : -26.039 Median : -16.526 Median : 71.29
## Mean : -58.46 Mean : -53.963 Mean : -44.373 Mean : 79.05
## 3rd Qu.: -25.88 3rd Qu.: -6.766 3rd Qu.: 2.495 3rd Qu.:123.86
## Max. : -13.01 Max. : -1.710 Max. : 7.421 Max. :215.00
## random
## Min. : 0.02806
## 1st Qu.:25.47671
## Median :50.16408
## Mean :50.17442
## 3rd Qu.:74.98432
## Max. :99.98975
##summary(charity, c=(respond, gift))### Not done
# "gift" was renamed to "gf" in section 3, so charity$gift is NULL: sd() of it
# is NA and var() stops with "'x' is NULL".
# as.numeric() leaves an already numeric column unchanged; text entries become
# NA (with a warning) and are skipped through na.rm = TRUE.
gift_values <- as.numeric(charity$gf)
sd(gift_values, na.rm = TRUE) # Standard deviation of the variable
var(gift_values, na.rm = TRUE) # Variance of the variable
# Quick graphical checks of single columns. The calls are kept exactly as
# written because hist() and plot() build their default titles and axis labels
# from the argument expression.
hist(charity$resplast)# Histogram of "resplast"

plot(charity$resplast)# Scatter plot of "resplast" (one vector only, so the values are drawn against their position)

plot(charity$weekslast)

6. Checking data
# Checking the names of the variables
names(charity)
## [1] "res" "gf" "resplast" "weekslast" "propresp"
## [6] "mailsyear" "giftlast" "avggift" "giftlastdum" "dum2"
## [11] "highlow" "weekslast1" "weekslast2" "weekslast3" "weeklast4"
## [16] "weekslast5" "random"
# Checking for missing values
# The column is called "gf" (see the names above). charity$gift is NULL, so
# is.na() on it warns and the count is always 0.
sum(is.na(charity$gf))
# Restore the saved objects from the .rda file (presumably including the
# export11 data frame used below).
load("/Users/vancam/Documents/WAIKATO-Thesis/VEScore/Vandata/export11.rda")
# Number of missing values in the selected column.
length(which(is.na(export11$exportval)))
## [1] 23370
# Per-column summary; columns with missing values list the NA count last.
summary(export11)
## province fcode taxcode industrialzone
## Min. : 1.00 Min. : 1 Length:53965 Min. :1.000
## 1st Qu.:27.00 1st Qu.: 4310 Class :character 1st Qu.:2.000
## Median :72.00 Median : 8775 Mode :character Median :2.000
## Mean :52.52 Mean : 182734 Mean :1.892
## 3rd Qu.:79.00 3rd Qu.: 65234 3rd Qu.:2.000
## Max. :98.00 Max. :1091031 Max. :2.000
## NA's :1131
## ftype export exportval importvol
## Min. : 1.00 Min. :0.00 Min. :0.000e+00 Min. :0.000e+00
## 1st Qu.: 9.00 1st Qu.:2.00 1st Qu.:0.000e+00 1st Qu.:0.000e+00
## Median : 9.00 Median :2.00 Median :0.000e+00 Median :0.000e+00
## Mean : 9.05 Mean :1.81 Mean :2.670e+06 Mean :2.167e+06
## 3rd Qu.:10.00 3rd Qu.:2.00 3rd Qu.:0.000e+00 3rd Qu.:0.000e+00
## Max. :14.00 Max. :2.00 Max. :9.659e+09 Max. :5.842e+09
## NA's :1129 NA's :23370 NA's :23605
## industrycode labor capital sales
## Min. :10101 Min. : 0.00 Min. : 0 Min. : 0
## 1st Qu.:14100 1st Qu.: 5.00 1st Qu.: 884 1st Qu.: 524
## Median :20231 Median : 12.00 Median : 2827 Median : 2630
## Mean :20383 Mean : 87.23 Mean : 43856 Mean : 63127
## 3rd Qu.:25910 3rd Qu.: 40.00 3rd Qu.: 12307 3rd Qu.: 13648
## Max. :33200 Max. :79909.00 Max. :71438869 Max. :127711298
## NA's :1331 NA's :2871 NA's :957
#Checking which value is missing
#which(is.na(export11$sales))# Hide the results
7. Treating missing data
# Build a small data frame and blank out two cells
vandf <- data.frame(c1 = 1:8, c2 = 8:15)
vandf$c1[4] <- NA # 4th value of column 1 becomes missing
vandf$c2[6] <- NA # 6th value of column 2 becomes missing
#1. Dropping the missing values of a column
na.exclude(vandf[["c1"]])
## [1] 1 2 3 5 6 7 8
## attr(,"na.action")
## [1] 4
## attr(,"class")
## [1] "exclude"
na.exclude(vandf[["c2"]]) # The position of the dropped value is kept in "na.action"
## [1] 8 9 10 11 12 14 15
## attr(,"na.action")
## [1] 6
## attr(,"class")
## [1] "exclude"
#2. Replace missing values with the mean of the non-missing values
#3. Replace missing values with the mean, the median or a specific value
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## combine, src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
vandf <- data.frame(c1 = 1:8, c2 = 8:15) # Create a new data frame
vandf[4, 1] <- vandf[6, 2] <- NA # One missing value in each column
impute(vandf$c2, median) # Replace the missing value with the median of the variable
## 1 2 3 4 5 6 7 8
## 8 9 10 11 12 11* 14 15
# This call has to run while c1 still contains its NA. Once the mean has been
# stored back into c1 there is nothing left to replace, and the specific value
# would never be used.
impute(vandf$c1, 20) # Replace the missing value with a specific value
vandf$c1 <- impute(vandf$c1, mean) # Replace the missing value with the mean and keep the result
# "gift" was renamed to "gf" in section 3; charity$gift is NULL, which only
# triggers an is.na() warning and imputes nothing.
# NOTE(review): a mean is only meaningful if "gf" is numeric at this point --
# confirm, or convert it first.
charity$gf <- impute(charity$gf, mean)
8. Label the variable
library(Hmisc)
# Attach a descriptive label to the gift column. The column is called "gf"
# since section 3; charity$gift is NULL and cannot carry an attribute.
label(charity$gf) <- "This is the amount of money giving to the charity each year"
charity$gf # Print the column
str(charity) # Structure of the whole data frame
## 'data.frame': 4268 obs. of 17 variables:
## $ res : int 0 0 0 0 1 0 0 0 0 1 ...
## $ gf : chr "100" "100" "100" "100" ...
## $ resplast : int 0 0 1 0 0 0 0 0 1 0 ...
## $ weekslast : num 143 65.4 13.1 120.1 103.9 ...
## $ propresp : num 0.3 0.3 0.3 0.3 0.2 ...
## $ mailsyear : num 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
## $ giftlast : chr "high" "high" "high" "high" ...
## $ avggift : num 10 10 10 10 10 10 10 6 10 5 ...
## $ giftlastdum: Factor w/ 1 level "1": 1 1 1 1 1 1 1 1 1 1 ...
## $ dum2 : Factor w/ 1 level "no": 1 1 1 1 1 1 1 1 1 1 ...
## $ highlow : chr "high" "high" "high" "high" ...
## $ weekslast1 : int 143 65 13 120 103 129 143 103 13 51 ...
## $ weekslast2 : num -141.6 -64.8 -13 -118.9 -102.8 ...
## $ weekslast3 : num -202.45 -42.38 -1.71 -142.9 -106.78 ...
## $ weeklast4 : num -192.02 -32.73 7.42 -132.7 -96.75 ...
## $ weekslast5 : num 163 85.4 33.1 140.1 123.9 ...
## $ random : num 90.3 14.6 66.8 42.1 42 ...
9. Some advanced statistics code
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
# Summarise several main statistics together
# summarise_each() is deprecated and the column is now called "gf" (renamed in
# section 3), so the statistics are spelled out in a plain summarise().
# transmute() keeps only the gift column as a number; text entries become NA
# (with a warning) and are skipped through na.rm = TRUE. n() counts all rows.
charity %>%
  dplyr::transmute(gf = as.numeric(gf)) %>%
  dplyr::summarise(
    mean = mean(gf, na.rm = TRUE),
    median = median(gf, na.rm = TRUE),
    max = max(gf, na.rm = TRUE),
    min = min(gf, na.rm = TRUE),
    sd = sd(gf, na.rm = TRUE),
    n = n()
  )
##Summary statistics - advanced
library(fBasics)
## Loading required package: timeDate
## Loading required package: timeSeries
##
## Rmetrics Package fBasics
## Analysing Markets and calculating Basic Statistics
## Copyright (C) 2005-2014 Rmetrics Association Zurich
## Educational Software for Financial Engineering and Computational Science
## Rmetrics is free software and comes with ABSOLUTELY NO WARRANTY.
## https://www.rmetrics.org --- Mail to: info@rmetrics.org
##
## Attaching package: 'fBasics'
## The following object is masked from 'package:car':
##
## densityPlot
# basicStats() stops with "non-numeric argument to binary operator" when the
# data frame contains character or factor columns, so only the numeric columns
# are passed on.
basicStats(charity[vapply(charity, is.numeric, logical(1))])
# Summary of more than one variable in one call
# The gift column is called "gf" since section 3; selecting "gift" fails with
# "undefined columns selected".
summary(charity[, c("gf", "resplast")])
# Check the first/last 10 rows
head(charity, n = 10)
## res gf resplast weekslast propresp mailsyear giftlast avggift
## 1 0 100 0 143.00000 0.3 2.5 high 10
## 2 0 100 0 65.42857 0.3 2.5 high 10
## 3 0 100 1 13.14286 0.3 2.5 high 10
## 4 0 100 0 120.14286 0.3 2.5 high 10
## 5 1 99 0 103.85714 0.2 2.5 high 10
## 6 0 100 0 129.14285 0.3 2.5 high 10
## 7 0 100 0 143.00000 0.3 2.5 high 10
## 8 0 100 0 103.85714 0.5 2.5 high 6
## 9 0 100 1 13.14286 0.3 2.5 high 10
## 10 1 one 0 51.28571 0.5 2.5 high 5
## giftlastdum dum2 highlow weekslast1 weekslast2 weekslast3 weeklast4
## 1 1 no high 143 -141.57000 -202.445100 -192.015100
## 2 1 no high 65 -64.77429 -42.380893 -32.726607
## 3 1 no high 13 -13.01143 -1.710074 7.421355
## 4 1 no high 120 -118.94143 -142.899638 -132.698210
## 5 1 no high 103 -102.81857 -106.784424 -96.745852
## 6 1 no high 129 -127.85142 -165.110977 -154.819548
## 7 1 no high 143 -141.57000 -202.445100 -192.015100
## 8 1 no high 103 -102.81857 -106.784424 -96.745852
## 9 1 no high 13 -13.01143 -1.710074 7.421355
## 10 1 no high 51 -50.77286 -26.039221 -16.526364
## weekslast5 random
## 1 163.00000 90.290620
## 2 85.42857 14.556989
## 3 33.14286 66.835776
## 4 140.14286 42.099129
## 5 123.85714 41.984395
## 6 149.14285 1.513396
## 7 163.00000 92.764559
## 8 123.85714 24.556847
## 9 33.14286 7.273638
## 10 71.28571 56.735000
# Last 10 rows of the data frame
tail(charity, 10L)
## res gf resplast weekslast propresp mailsyear giftlast avggift
## 4259 0 100 1 51.28571 1.000000 0.50 high 30
## 4260 1 99 1 51.28571 0.800000 1.25 high 10
## 4261 0 100 0 51.28571 0.333333 0.75 high 50
## 4262 1 25 0 51.28571 0.333333 0.75 high 25
## 4263 0 100 0 51.28571 0.333333 0.75 high 50
## 4264 1 99 1 13.14286 1.000000 0.75 high 15
## 4265 1 25 1 13.14286 0.666667 0.75 high 25
## 4266 1 25 0 51.28571 0.333333 0.75 high 25
## 4267 0 100 0 51.28571 0.333333 0.75 high 50
## 4268 1 25 0 51.28571 0.333333 0.75 high 25
## giftlastdum dum2 highlow weekslast1 weekslast2 weekslast3 weeklast4
## 4259 1 no high 51 -50.77286 -26.039221 -16.526364
## 4260 1 no high 51 -50.77286 -26.039221 -16.526364
## 4261 1 no high 51 -50.77286 -26.039221 -16.526364
## 4262 1 no high 51 -50.77286 -26.039221 -16.526364
## 4263 1 no high 51 -50.77286 -26.039221 -16.526364
## 4264 1 no high 13 -13.01143 -1.710074 7.421355
## 4265 1 no high 13 -13.01143 -1.710074 7.421355
## 4266 1 no high 51 -50.77286 -26.039221 -16.526364
## 4267 1 no high 51 -50.77286 -26.039221 -16.526364
## 4268 1 no high 51 -50.77286 -26.039221 -16.526364
## weekslast5 random
## 4259 71.28571 0.335298
## 4260 71.28571 66.769744
## 4261 71.28571 44.764990
## 4262 71.28571 20.877626
## 4263 71.28571 14.409151
## 4264 33.14286 38.035370
## 4265 33.14286 88.917001
## 4266 71.28571 90.931567
## 4267 71.28571 5.500989
## 4268 71.28571 64.568571