What does the dataset loo like?
data("iris")
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
head(iris, 5)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
data("mtcars")
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
head(mtcars,5)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
Study one numeric variable.
hist(iris$Sepal.Length, freq = F)
lines(density(iris$Sepal.Length), col = "red")
bwplot(mtcars$am ~ mtcars$mpg)
Study one categorical variable.
pie(table(mtcars$cyl))
Plot two vriables.
with(iris, plot(Sepal.Length, Sepal.Width))
xyplot(Sepal.Length ~ Sepal.Width | Species, data = iris)
How about three variables?
ggplot(iris, aes(Sepal.Length, Sepal.Width, color = Species)) +
geom_point()
Let’s plot four variables.
ggplot(mtcars, aes(mpg, wt, color = gear)) +
geom_point() +
facet_grid(~am)
Want to plot the whole dataset.
scatterplotMatrix(mtcars[1:6], spread = F, lty.smooth = 2,
main = "Scatter plot matirx")
Change the wide table to the long table (such as 4 variables to 2 variables).
ldf <- df %>% gather(variable, value, -Species, -Key)
head(ldf,5)
## Species Key variable value
## 1 setosa 1 Sepal.Length 5.1
## 2 setosa 2 Sepal.Length 4.9
## 3 setosa 3 Sepal.Length 4.7
## 4 setosa 4 Sepal.Length 4.6
## 5 setosa 5 Sepal.Length 5.0
Change the long table back to the wide table.
sdf <- ldf %>% spread(variable, value)
head(sdf,5)
## Species Key Petal.Length Petal.Width Sepal.Length Sepal.Width
## 1 setosa 1 1.4 0.2 5.1 3.5
## 2 setosa 2 1.4 0.2 4.9 3.0
## 3 setosa 3 1.3 0.2 4.7 3.2
## 4 setosa 4 1.5 0.2 4.6 3.1
## 5 setosa 5 1.4 0.2 5.0 3.6
Seperate one cloumn into two columns.
sedf <- df %>% separate(Sepal.Length, c("Var1", "Var2"))
head(sedf,5)
## Var1 Var2 Sepal.Width Petal.Length Petal.Width Species Key
## 1 5 1 3.5 1.4 0.2 setosa 1
## 2 4 9 3.0 1.4 0.2 setosa 2
## 3 4 7 3.2 1.3 0.2 setosa 3
## 4 4 6 3.1 1.5 0.2 setosa 4
## 5 5 <NA> 3.6 1.4 0.2 setosa 5
Unite two columns into one column.
udf <- sedf %>% unite(Sepal.Length, Var1, Var2, sep = ".")
head(udf,5)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species Key
## 1 5.1 3.5 1.4 0.2 setosa 1
## 2 4.9 3.0 1.4 0.2 setosa 2
## 3 4.7 3.2 1.3 0.2 setosa 3
## 4 4.6 3.1 1.5 0.2 setosa 4
## 5 5.NA 3.6 1.4 0.2 setosa 5
Change the format of variable names.
string <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width" )
gsub(".", "-", string, fixed = TRUE)
## [1] "Sepal-Length" "Sepal-Width" "Petal-Length" "Petal-Width"
Split one variables (such as 4 variables to 3 variables).
output <- iris %>%
mutate(Key=seq_len(n())) %>%
gather(flower_part, Value, -Species,-Key) %>%
separate(flower_part,into=c("Part","Measure")) %>%
spread(Measure, Value)
head(output, 5)
## Species Key Part Length Width
## 1 setosa 1 Petal 1.4 0.2
## 2 setosa 1 Sepal 5.1 3.5
## 3 setosa 2 Petal 1.4 0.2
## 4 setosa 2 Sepal 4.9 3.0
## 5 setosa 3 Petal 1.3 0.2
output %>% ggplot(aes(x=Width, y=Length,col=Species, shape=Species)) +
facet_grid(~Part) + geom_point()
Exchange variables of the dataset.I am sure there is a better way to do it.
con1 <- iris %>%
filter(Species == "setosa") %>%
select(-Species) %>%
gather(variables, setosa)
con2 <- iris %>%
filter(Species == "versicolor") %>%
select(-Species) %>%
gather(variables, versicolor)
con3 <- iris %>%
filter(Species == "virginica") %>%
select(-Species) %>%
gather(variables, virginica)
con4 <- cbind(con1, con2[2], con3[2])
head(con4, 5)
## variables setosa versicolor virginica
## 1 Sepal.Length 5.1 7.0 6.3
## 2 Sepal.Length 4.9 6.4 5.8
## 3 Sepal.Length 4.7 6.9 7.1
## 4 Sepal.Length 4.6 5.5 6.3
## 5 Sepal.Length 5.0 6.5 6.5
Add new variables to the dataset.
dfdate <- mtcars %>%
select(1,2,4,9,10) %>%
mutate(Var1 = mpg/hp,
time = ymd(today()))
head(dfdate, 5)
## mpg cyl hp am gear Var1 time
## 1 21.0 6 110 1 4 0.1909091 2018-11-20
## 2 21.0 6 110 1 4 0.1909091 2018-11-20
## 3 22.8 4 93 1 4 0.2451613 2018-11-20
## 4 21.4 6 110 0 3 0.1945455 2018-11-20
## 5 18.7 8 175 0 3 0.1068571 2018-11-20
Manipulate the formate of date.
dfdate$wday <- wday(dfdate$time, label = TRUE)
head(dfdate, 5)
## mpg cyl hp am gear Var1 time wday
## 1 21.0 6 110 1 4 0.1909091 2018-11-20 Tue
## 2 21.0 6 110 1 4 0.1909091 2018-11-20 Tue
## 3 22.8 4 93 1 4 0.2451613 2018-11-20 Tue
## 4 21.4 6 110 0 3 0.1945455 2018-11-20 Tue
## 5 18.7 8 175 0 3 0.1068571 2018-11-20 Tue
Factorize categorical variables.
dfdate$am.lbl <- factor(dfdate$am, labels = c("Automatic", "Manual"))
head(dfdate, 5)
## mpg cyl hp am gear Var1 time wday am.lbl
## 1 21.0 6 110 1 4 0.1909091 2018-11-20 Tue Manual
## 2 21.0 6 110 1 4 0.1909091 2018-11-20 Tue Manual
## 3 22.8 4 93 1 4 0.2451613 2018-11-20 Tue Manual
## 4 21.4 6 110 0 3 0.1945455 2018-11-20 Tue Automatic
## 5 18.7 8 175 0 3 0.1068571 2018-11-20 Tue Automatic
aggregate(mpg~am.lbl, data = dfdate, FUN = mean)
## am.lbl mpg
## 1 Automatic 17.14737
## 2 Manual 24.39231
Export the dateset in csv.
final <- mtcars %>%
select(1,2,4,9,10) %>%
group_by(am, gear) %>%
summarise(Mean = mean(hp),
SD = sd(hp)) %>%
arrange(-Mean)
head(final,5)
## # A tibble: 4 x 4
## # Groups: am [2]
## am gear Mean SD
## <dbl> <dbl> <dbl> <dbl>
## 1 1. 5. 196. 103.
## 2 0. 3. 176. 47.7
## 3 0. 4. 101. 29.0
## 4 1. 4. 83.9 24.2
write.csv(final, file = "final.csv",row.names=FALSE)
Slice a part of dataset.
mtcars %>%
slice(1:5)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## 2 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## 3 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## 4 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## 5 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
Filter across the whole dataset.
mtcars %>%
filter_all(any_vars(. > 150)) %>%
head(5)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## 2 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## 3 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## 4 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## 5 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
Filter based on required conditions.
mtcars %>%
filter(cyl == 6, mpg >= 20)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## 2 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## 3 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
mtcars %>%
filter(cyl %in% c(4, 6)) %>%
head(5)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## 2 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## 3 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## 4 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## 5 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
Filter using regex.
mtcars %>%
rownames_to_column %>%
filter(str_detect(rowname, "Merc|Toy"))
## rowname mpg cyl disp hp drat wt qsec vs am gear carb
## 1 Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 2 Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## 3 Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 4 Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## 5 Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## 6 Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## 7 Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## 8 Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## 9 Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
mtcars %>%
rownames_to_column %>%
filter(str_detect(rowname, "^L"))
## rowname mpg cyl disp hp drat wt qsec vs am gear carb
## 1 Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## 2 Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
mtcars %>%
rownames_to_column %>%
filter(str_detect(rowname, "L$"))
## rowname mpg cyl disp hp drat wt qsec vs am gear carb
## 1 Merc 450SL 17.3 8 275.8 180 3.07 3.73 17.6 0 0 3 3
## 2 Ford Pantera L 15.8 8 351.0 264 4.22 3.17 14.5 0 1 5 4
mtcars %>%
rownames_to_column %>%
filter(str_detect(rowname, "\\d"))
## rowname mpg cyl disp hp drat wt qsec vs am gear carb
## 1 Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2 Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 3 Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 4 Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 5 Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## 6 Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 7 Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## 8 Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 9 Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## 10 Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## 11 Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## 12 Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## 13 Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## 14 Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## 15 Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## 16 Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## 17 Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
mtcars %>%
rownames_to_column %>%
filter(!str_detect(rowname, "\\d"))
## rowname mpg cyl disp hp drat wt qsec vs am gear carb
## 1 Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 2 Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## 3 Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## 4 Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## 5 Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## 6 Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 7 Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## 8 Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 9 Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## 10 AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## 11 Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## 12 Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## 13 Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 14 Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## 15 Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
Highlight the subset when plotting.
ggplot(mtcars, aes(wt, mpg, colour = cyl, use_group_by=FALSE)) +
geom_point() +
gghighlight(disp > 460, label_key = am) +
scale_x_discrete(name ="wt")
R has a very rich statistical library.
model1 <- lm(mpg~., data = mtcars)
summary(model1)
##
## Call:
## lm(formula = mpg ~ ., data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4506 -1.6044 -0.1196 1.2193 4.6271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.30337 18.71788 0.657 0.5181
## cyl -0.11144 1.04502 -0.107 0.9161
## disp 0.01334 0.01786 0.747 0.4635
## hp -0.02148 0.02177 -0.987 0.3350
## drat 0.78711 1.63537 0.481 0.6353
## wt -3.71530 1.89441 -1.961 0.0633 .
## qsec 0.82104 0.73084 1.123 0.2739
## vs 0.31776 2.10451 0.151 0.8814
## am 2.52023 2.05665 1.225 0.2340
## gear 0.65541 1.49326 0.439 0.6652
## carb -0.19942 0.82875 -0.241 0.8122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared: 0.869, Adjusted R-squared: 0.8066
## F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07
par(mfrow = c(2,2))
plot(model1)
Build regression mdoels by trying every two possible variables in the dataset.
setdiff(names(mtcars), "mpg") %>%
combn(2, paste, collapse = " + ") %>%
structure(., names = .) %>%
map(~ formula(paste("mpg ~ ", .x))) %>%
map(lm, data = mtcars) %>%
map_df(tidy, .id = "predictors") %>%
arrange(-std.error) %>%
head(20)
## predictors term estimate std.error statistic p.value
## 1 qsec + carb (Intercept) 19.139362 13.847403 1.3821625 1.774758e-01
## 2 qsec + vs (Intercept) 26.182530 11.625525 2.2521589 3.205404e-02
## 3 hp + qsec (Intercept) 48.323705 11.103306 4.3521906 1.526469e-04
## 4 qsec + gear (Intercept) -30.710845 9.670228 -3.1758139 3.529643e-03
## 5 cyl + qsec (Intercept) 47.381699 8.806606 5.3802454 8.819050e-06
## 6 drat + qsec (Intercept) -27.839918 8.299227 -3.3545196 2.229131e-03
## 7 cyl + drat (Intercept) 28.724665 7.592143 3.7834726 7.178250e-04
## 8 drat + wt (Intercept) 30.290370 7.317878 4.1392285 2.737824e-04
## 9 disp + qsec (Intercept) 25.504508 7.184094 3.5501356 1.335871e-03
## 10 drat + am (Intercept) -1.949883 7.073285 -0.2756687 7.847574e-01
## 11 disp + drat (Intercept) 21.844880 6.747971 3.2372516 3.016655e-03
## 12 qsec + am (Intercept) -18.889281 6.596973 -2.8633255 7.710583e-03
## 13 am + gear (Intercept) 16.864675 6.511673 2.5899142 1.486255e-02
## 14 drat + gear (Intercept) -7.537474 5.580198 -1.3507539 1.872206e-01
## 15 wt + qsec (Intercept) 19.746223 5.252062 3.7597088 7.650466e-04
## 16 wt + gear (Intercept) 38.915653 5.097397 7.6344173 2.037781e-08
## 17 hp + drat (Intercept) 10.789861 5.077752 2.1249289 4.223773e-02
## 18 cyl + gear (Intercept) 34.659497 4.936925 7.0204633 1.013557e-07
## 19 drat + vs (Intercept) -1.825317 4.902758 -0.3723041 7.123734e-01
## 20 disp + gear (Intercept) 29.104942 4.485919 6.4880668 4.214144e-07
K-mean cluster.
iriscaled <- scale(iris[,-5])
fitk <- kmeans(iriscaled, 3, nstart = 100)
plot(iris, col = fitk$cluster)
fitk$size
## [1] 50 47 53
table(iris$Species, fitk$cluster)
##
## 1 2 3
## setosa 50 0 0
## versicolor 0 11 39
## virginica 0 36 14