## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :-1.86378 Min. :-2.4258 Min. :-1.5623 Min. :-1.4422
## 1st Qu.:-0.89767 1st Qu.:-0.5904 1st Qu.:-1.2225 1st Qu.:-1.1799
## Median :-0.05233 Median :-0.1315 Median : 0.3354 Median : 0.1321
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.67225 3rd Qu.: 0.5567 3rd Qu.: 0.7602 3rd Qu.: 0.7880
## Max. : 2.48370 Max. : 3.0805 Max. : 1.7799 Max. : 1.7064
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# Min-max normalization of the non-Species iris columns to the [0, 1] range.
# Column-wise maxima/minima are taken with vapply(), which walks the data
# frame columns directly and fails loudly if a column does not produce a
# single numeric value (apply() would first coerce the data frame to a matrix).
mxs <- vapply(select(iris, -Species), max, numeric(1), na.rm = TRUE)
mns <- vapply(select(iris, -Species), min, numeric(1), na.rm = TRUE)
# scale() subtracts `center` and divides by `scale` column by column, i.e.
# (x - min) / (max - min); the Species column is bound back on unchanged.
iris.norm <- cbind(
  scale(select(iris, -Species), center = mns, scale = mxs - mns),
  select(iris, Species)
)
summary(iris.norm)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.2222 1st Qu.:0.3333 1st Qu.:0.1017 1st Qu.:0.08333
## Median :0.4167 Median :0.4167 Median :0.5678 Median :0.50000
## Mean :0.4287 Mean :0.4406 Mean :0.4675 Mean :0.45806
## 3rd Qu.:0.5833 3rd Qu.:0.5417 3rd Qu.:0.6949 3rd Qu.:0.70833
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
library(Hmisc) # for cut2()
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
# Load the Boston data set from MASS without attaching the package.
data("Boston", package = "MASS")
# Distribution of `age`, the numeric variable discretized below.
summary(Boston[["age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.90 45.02 77.50 68.57 94.08 100.00
# Equal-width binning: cut() with a single number splits the range of `age`
# into that many intervals; the bin counts differ widely (see table below).
Boston[["newAge"]] <- cut(Boston[["age"]], breaks = 5)
table(Boston[["newAge"]])
##
## (2.8,22.3] (22.3,41.7] (41.7,61.2] (61.2,80.6] (80.6,100]
## 45 71 70 81 239
# Same five equal-width bins, this time with our own labels for the levels.
Boston[["newAge"]] <- cut(
  Boston[["age"]],
  breaks = 5,
  labels = c("verynew", "new", "normal", "old", "veryold")
)
table(Boston[["newAge"]])
##
## verynew new normal old veryold
## 45 71 70 81 239
# Equal-frequency binning: cut2() (from Hmisc) with g = 5 asks for 5 quantile
# groups, so every bin holds roughly the same number of cases (counts below).
Boston$newAge <- cut2(Boston$age, g = 5)
table(Boston$newAge)
##
## [ 2.9, 38.1) [38.1, 66.1) [66.1, 86.1) [86.1, 95.7) [95.7,100.0]
## 102 101 101 101 101
# Same quantile bins with our own labels: cut2() already returns a factor, so
# factor(..., labels = ) renames its levels in order.
Boston$newAge <- factor(
  cut2(Boston$age, g = 5),
  labels = c("verynew", "new", "normal", "old", "veryold")
)
table(Boston$newAge)
##
## verynew new normal old veryold
## 102 101 101 101 101
# 3.3.3 Creating variables ----
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(xts)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
# Four-observation xts time series. The index is parsed by ymd() from
# "YYYY-MM-DD" strings, using the time zone held in the TZ environment variable.
sp500 <- xts(
  x = c(1102.94, 1104.49, 1115.71, 1118.31),
  order.by = ymd(
    c("2010-02-25", "2010-02-26", "2010-03-01", "2010-03-02"),
    tz = Sys.getenv("TZ")
  )
)
sp500
## [,1]
## 2010-02-25 1102.94
## 2010-02-26 1104.49
## 2010-03-01 1115.71
## 2010-03-02 1118.31