# Install the packages needed to knit R Markdown documents (only needs to be
# run once per machine). Strings must use straight quotes; typographic
# "curly" quotes are not string delimiters in R and cause a syntax error.
install.packages(c("rmarkdown", "knitr"))
R can evaluate expressions just like a calculator:
# Create a new chunk:
# Windows / Linux: Ctrl + Alt + I
# macOS: Cmd + Option + I
# Or click the "+C" (Insert Chunk) button in the RStudio editor toolbar
# Addition
2 + 3
## [1] 5
# Division
8 / 3
## [1] 2.666667
# Exponentiation
2^3
## [1] 8
You can also use built-in constants:
# pi is a built-in constant
pi
## [1] 3.141593
# exp(1) evaluates e^1, i.e. Euler's number e
exp(1)
## [1] 2.718282
# Constants and function results can be used in larger expressions
2*exp(1)
## [1] 5.436564
Assignment is done using <-:
# Store 60 under the name 'height'
height <- 60
# The right-hand side is evaluated first, then the name is re-bound: 60 + 5 = 65
height <- height + 5
# 2 * 65 = 130
height <- 2 * height
# Typing a name on its own prints its current value
height
## [1] 130
Create a vector and apply operations:
# Load the 'wage1' data set from the wooldridge package (the package itself
# is not attached; only the data set is brought into the workspace)
data(wage1, package = "wooldridge")
# Extract the wage column of the data frame as a numeric vector
wages <- wage1$wage
# First six elements (the default for head())
head(wages)
## [1] 3.10 3.24 3.00 6.00 5.30 8.75
# First 20 elements
head(wages, n=20)
## [1] 3.10 3.24 3.00 6.00 5.30 8.75 11.25 5.00 3.60 18.18 6.25 8.13
## [13] 8.77 5.50 22.20 17.33 7.50 10.63 3.60 4.50
# Five-number summary plus the mean
summary(wages)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.530 3.330 4.650 5.896 6.880 24.980
Indexing and logical filtering:
# 'wages > 10' is a logical vector; indexing with it keeps the elements
# where the condition is TRUE
wages[wages > 10]
## [1] 11.25 18.18 22.20 17.33 10.63 12.50 12.50 13.00 13.70 21.63 11.71 12.39
## [13] 19.98 13.08 11.90 11.76 13.16 15.00 13.33 24.98 10.95 11.55 15.38 14.58
## [25] 12.50 21.86 11.11 22.86 18.16 10.91 18.00 18.89 13.95 18.16 11.98 12.22
## [37] 15.00 12.50 11.10 12.50 10.92 12.50 10.38 20.00 11.25 14.38 17.50 11.82
## [49] 12.50 17.71 15.00 11.56
# which() returns the positions (indices) where the condition is TRUE,
# not the values themselves
which(wages > 20)
## [1] 15 59 112 186 229
Check the structure and type of variables:
# Compact overview of the data frame: dimensions, column names, column types
# and the first few values of each column
str(wage1)
## 'data.frame': 526 obs. of 24 variables:
## $ wage : num 3.1 3.24 3 6 5.3 ...
## $ educ : int 11 12 11 8 12 16 18 12 12 17 ...
## $ exper : int 2 22 2 44 7 9 15 5 26 22 ...
## $ tenure : int 0 2 0 28 2 8 7 3 4 21 ...
## $ nonwhite: int 0 0 0 0 0 0 0 0 0 0 ...
## $ female : int 1 1 0 0 0 0 0 1 1 0 ...
## $ married : int 0 1 0 1 1 1 0 0 0 1 ...
## $ numdep : int 2 3 2 0 1 0 0 0 2 0 ...
## $ smsa : int 1 1 0 1 0 1 1 1 1 1 ...
## $ northcen: int 0 0 0 0 0 0 0 0 0 0 ...
## $ south : int 0 0 0 0 0 0 0 0 0 0 ...
## $ west : int 1 1 1 1 1 1 1 1 1 1 ...
## $ construc: int 0 0 0 0 0 0 0 0 0 0 ...
## $ ndurman : int 0 0 0 0 0 0 0 0 0 0 ...
## $ trcommpu: int 0 0 0 0 0 0 0 0 0 0 ...
## $ trade : int 0 0 1 0 0 0 1 0 1 0 ...
## $ services: int 0 1 0 0 0 0 0 0 0 0 ...
## $ profserv: int 0 0 0 0 0 1 0 0 0 0 ...
## $ profocc : int 0 0 0 0 0 1 1 1 1 1 ...
## $ clerocc : int 0 0 0 1 0 0 0 0 0 0 ...
## $ servocc : int 0 1 0 0 0 0 0 0 0 0 ...
## $ lwage : num 1.13 1.18 1.1 1.79 1.67 ...
## $ expersq : int 4 484 4 1936 49 81 225 25 676 484 ...
## $ tenursq : int 0 4 0 784 4 64 49 9 16 441 ...
## - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
# Internal storage type of the wage column
typeof(wage1$wage)
## [1] "double"
# 'female' is stored as an integer 0/1 indicator, not as a factor
is.factor(wage1$female)
## [1] FALSE
Convert numeric to factor, inspect categories:
# Turn the 0/1 indicator into a labelled factor. Labels are applied to the
# sorted distinct values, so 0 becomes "Male" and 1 becomes "Female".
wage1$female <- factor(wage1$female, labels = c("Male", "Female"))
# Count the observations in each category
table(wage1$female)
##
## Male Female
## 274 252
Missing data appears as NA. Functions like
mean() will return NA unless handled.
x <- c(1, 2, NA, 4)
# A single NA makes the result NA
mean(x)
## [1] NA
# na.rm = TRUE drops missing values first: (1 + 2 + 4) / 3
mean(x, na.rm = TRUE)
## [1] 2.333333
Explore and filter data frames:
# 'wages' is a vector created earlier; 'wage1' is the full data frame.
str(wage1)
## 'data.frame': 526 obs. of 24 variables:
## $ wage : num 3.1 3.24 3 6 5.3 ...
## $ educ : int 11 12 11 8 12 16 18 12 12 17 ...
## $ exper : int 2 22 2 44 7 9 15 5 26 22 ...
## $ tenure : int 0 2 0 28 2 8 7 3 4 21 ...
## $ nonwhite: int 0 0 0 0 0 0 0 0 0 0 ...
## $ female : Factor w/ 2 levels "Male","Female": 2 2 1 1 1 1 1 2 2 1 ...
## $ married : int 0 1 0 1 1 1 0 0 0 1 ...
## $ numdep : int 2 3 2 0 1 0 0 0 2 0 ...
## $ smsa : int 1 1 0 1 0 1 1 1 1 1 ...
## $ northcen: int 0 0 0 0 0 0 0 0 0 0 ...
## $ south : int 0 0 0 0 0 0 0 0 0 0 ...
## $ west : int 1 1 1 1 1 1 1 1 1 1 ...
## $ construc: int 0 0 0 0 0 0 0 0 0 0 ...
## $ ndurman : int 0 0 0 0 0 0 0 0 0 0 ...
## $ trcommpu: int 0 0 0 0 0 0 0 0 0 0 ...
## $ trade : int 0 0 1 0 0 0 1 0 1 0 ...
## $ services: int 0 1 0 0 0 0 0 0 0 0 ...
## $ profserv: int 0 0 0 0 0 1 0 0 0 0 ...
## $ profocc : int 0 0 0 0 0 1 1 1 1 1 ...
## $ clerocc : int 0 0 0 1 0 0 0 0 0 0 ...
## $ servocc : int 0 1 0 0 0 0 0 0 0 0 ...
## $ lwage : num 1.13 1.18 1.1 1.79 1.67 ...
## $ expersq : int 4 484 4 1936 49 81 225 25 676 484 ...
## $ tenursq : int 0 4 0 784 4 64 49 9 16 441 ...
## - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
# List the names of all variables (columns) in the data frame
names(wage1)
## [1] "wage" "educ" "exper" "tenure" "nonwhite" "female"
## [7] "married" "numdep" "smsa" "northcen" "south" "west"
## [13] "construc" "ndurman" "trcommpu" "trade" "services" "profserv"
## [19] "profocc" "clerocc" "servocc" "lwage" "expersq" "tenursq"
# glimpse() from dplyr: a compact overview with one line per column
library(dplyr)
glimpse(wage1)
## Rows: 526
## Columns: 24
## $ wage <dbl> 3.10, 3.24, 3.00, 6.00, 5.30, 8.75, 11.25, 5.00, 3.60, 18.18,…
## $ educ <int> 11, 12, 11, 8, 12, 16, 18, 12, 12, 17, 16, 13, 12, 12, 12, 16…
## $ exper <int> 2, 22, 2, 44, 7, 9, 15, 5, 26, 22, 8, 3, 15, 18, 31, 14, 10, …
## $ tenure <int> 0, 2, 0, 28, 2, 8, 7, 3, 4, 21, 2, 0, 0, 3, 15, 0, 0, 10, 0, …
## $ nonwhite <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ female <fct> Female, Female, Male, Male, Male, Male, Male, Female, Female,…
## $ married <int> 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0…
## $ numdep <int> 2, 3, 2, 0, 1, 0, 0, 0, 2, 0, 0, 0, 2, 0, 1, 1, 0, 0, 3, 0, 0…
## $ smsa <int> 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ northcen <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ south <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ west <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ construc <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ndurman <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ trcommpu <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ trade <int> 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ services <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ profserv <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1…
## $ profocc <int> 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1…
## $ clerocc <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ servocc <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ lwage <dbl> 1.1314021, 1.1755733, 1.0986123, 1.7917595, 1.6677068, 2.1690…
## $ expersq <int> 4, 484, 4, 1936, 49, 81, 225, 25, 676, 484, 64, 9, 225, 324, …
## $ tenursq <int> 0, 4, 0, 784, 4, 64, 49, 9, 16, 441, 4, 0, 0, 9, 225, 0, 0, 1…
# First six rows of the data frame
head(wage1)
## wage educ exper tenure nonwhite female married numdep smsa northcen south
## 1 3.10 11 2 0 0 Female 0 2 1 0 0
## 2 3.24 12 22 2 0 Female 1 3 1 0 0
## 3 3.00 11 2 0 0 Male 0 2 0 0 0
## 4 6.00 8 44 28 0 Male 1 0 1 0 0
## 5 5.30 12 7 2 0 Male 1 1 0 0 0
## 6 8.75 16 9 8 0 Male 1 0 1 0 0
## west construc ndurman trcommpu trade services profserv profocc clerocc
## 1 1 0 0 0 0 0 0 0 0
## 2 1 0 0 0 0 1 0 0 0
## 3 1 0 0 0 1 0 0 0 0
## 4 1 0 0 0 0 0 0 0 1
## 5 1 0 0 0 0 0 0 0 0
## 6 1 0 0 0 0 0 1 1 0
## servocc lwage expersq tenursq
## 1 0 1.131402 4 0
## 2 1 1.175573 484 4
## 3 0 1.098612 4 0
## 4 0 1.791759 1936 784
## 5 0 1.667707 49 4
## 6 0 2.169054 81 64
# Summary statistics for every column (category counts for the factor column)
summary(wage1)
## wage educ exper tenure
## Min. : 0.530 Min. : 0.00 Min. : 1.00 Min. : 0.000
## 1st Qu.: 3.330 1st Qu.:12.00 1st Qu.: 5.00 1st Qu.: 0.000
## Median : 4.650 Median :12.00 Median :13.50 Median : 2.000
## Mean : 5.896 Mean :12.56 Mean :17.02 Mean : 5.105
## 3rd Qu.: 6.880 3rd Qu.:14.00 3rd Qu.:26.00 3rd Qu.: 7.000
## Max. :24.980 Max. :18.00 Max. :51.00 Max. :44.000
## nonwhite female married numdep
## Min. :0.0000 Male :274 Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 Female:252 1st Qu.:0.0000 1st Qu.:0.000
## Median :0.0000 Median :1.0000 Median :1.000
## Mean :0.1027 Mean :0.6084 Mean :1.044
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:2.000
## Max. :1.0000 Max. :1.0000 Max. :6.000
## smsa northcen south west
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.0000 Median :0.000 Median :0.0000 Median :0.0000
## Mean :0.7224 Mean :0.251 Mean :0.3555 Mean :0.1692
## 3rd Qu.:1.0000 3rd Qu.:0.750 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.000 Max. :1.0000 Max. :1.0000
## construc ndurman trcommpu trade
## Min. :0.00000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000 Median :0.00000 Median :0.0000
## Mean :0.04563 Mean :0.1141 Mean :0.04373 Mean :0.2871
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## services profserv profocc clerocc
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.1008 Mean :0.2586 Mean :0.3669 Mean :0.1673
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## servocc lwage expersq tenursq
## Min. :0.0000 Min. :-0.6349 Min. : 1.0 Min. : 0.00
## 1st Qu.:0.0000 1st Qu.: 1.2030 1st Qu.: 25.0 1st Qu.: 0.00
## Median :0.0000 Median : 1.5369 Median : 182.5 Median : 4.00
## Mean :0.1407 Mean : 1.6233 Mean : 473.4 Mean : 78.15
## 3rd Qu.:0.0000 3rd Qu.: 1.9286 3rd Qu.: 676.0 3rd Qu.: 49.00
## Max. :1.0000 Max. : 3.2181 Max. :2601.0 Max. :1936.00
# Summary of a single column
summary(wage1$educ)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 12.00 12.00 12.56 14.00 18.00
# Keep the rows where educ is greater than 16, with all columns
high_edu <- wage1[wage1$educ > 16, ]
# df[rows, columns] — blank means "all"; e.g., df[, 2] = all rows, column 2
# The original row numbers (7, 10, 59, ...) are kept as row names
head(high_edu)
## wage educ exper tenure nonwhite female married numdep smsa northcen south
## 7 11.25 18 15 7 0 Male 0 0 1 0 0
## 10 18.18 17 22 21 0 Male 1 0 1 0 0
## 59 21.63 18 8 8 0 Female 0 0 1 0 0
## 72 13.08 17 17 2 1 Male 1 3 1 0 0
## 80 7.14 18 13 0 0 Male 1 2 1 0 0
## 110 9.80 17 7 0 0 Male 1 0 1 0 0
## west construc ndurman trcommpu trade services profserv profocc clerocc
## 7 1 0 0 0 1 0 0 1 0
## 10 1 0 0 0 0 0 0 1 0
## 59 0 0 0 0 0 0 1 1 0
## 72 0 0 0 0 0 0 1 1 0
## 80 0 0 0 0 0 0 1 1 0
## 110 0 0 0 0 0 1 0 1 0
## servocc lwage expersq tenursq
## 7 0 2.420368 225 49
## 10 0 2.900322 484 441
## 59 0 3.074081 64 64
## 72 0 2.571084 289 4
## 80 0 1.965713 169 0
## 110 0 2.282382 49 0
# Create a sub-sample (wage > 10 and exper > 5). Filter once, keep the result
# for further analysis, then print it, instead of running subset() twice.
wage_subset <- subset(wage1, wage > 10 & exper > 5)
wage_subset
# Chunk option eval=FALSE -> shows the code in the output but does not run it.
# Chunk option echo=FALSE -> hides the code and shows only the results.
library(readr)
library(readxl)

# Write the data frame to a CSV file in the working directory
write_csv(wage1, "wage1_output.csv")

# Read the CSV file back in
df <- read_csv("wage1_output.csv")

# Read an Excel workbook. Nothing above creates an .xlsx file, so only attempt
# the read when the file actually exists; otherwise read_excel() stops with an
# error and halts the rest of the script.
if (file.exists("wage1_output.xlsx")) {
  read_excel("wage1_output.xlsx")
}
Load a package, or access functions selectively:
# dplyr was attached earlier with library(). wooldridge was never attached
# (only its wage1 data set was loaded), so reach into it with the :: prefix.
head(wooldridge::wage1)
# dplyr's select(): keep only the wage and educ columns
select(wage1, wage, educ)
Function name conflicts can occur. For example, select()
exists in both MASS and dplyr.
# Attaching MASS after dplyr masks dplyr's select() with MASS::select(),
# so an unqualified call no longer reaches the dplyr version.
library(MASS)
# select(wage1, exper, wage)  # error: this now calls MASS::select()
# Use the dplyr version explicitly via the package prefix.
dplyr::select(wage1, exper, wage)
Common mistakes:
# Object not found: no object named 'wgae' exists.
# mean(wgae)
# Invalid index: the data frame 'wage1' only has 24 columns.
# wage1[, 200]
# Comparison vs assignment
# Work on a copy so the real wage column is not overwritten; later steps
# (e.g. the histogram of wage1$wage) still need the original values.
wage_demo <- wage1
wage_demo$wage = 5   # `=` assigns: every observation of wage becomes 5
wage_demo$wage == 5  # `==` compares: returns TRUE/FALSE for each observation
Warnings do not stop execution but signal issues:
a <- 1:4
# 101 values are assigned into a single position: only the first value (100)
# is stored, and R warns that the lengths do not match. Execution continues.
a[5] <- 100:200
## Warning in a[5] <- 100:200: number of items to replace is not a multiple of replacement length
Use debugging tools:
# traceback() shows the call stack of the most recent uncaught error;
# here there has been none, so there is nothing to show.
traceback()
## No traceback available
print("Use print() inside functions to debug")
## [1] "Use print() inside functions to debug"
Use ggplot2 to visualize variable distribution:
# Histogram of the wage variable. ggplot2 must be attached first: nothing in
# the visible part of this script loads it, so ggplot() would otherwise fail
# with 'could not find function "ggplot"'.
library(ggplot2)
ggplot(wage1, aes(x = wage)) +
  geom_histogram(bins = 30, fill = "steelblue") +
  labs(title = "Wage Distribution", x = "Wage", y = "Count")
You can mix code and explanation in one document.
---
title: "Homework 1"
author: "Riley Student"
output: html_document
---
Inline code: `mean(x)`. Emphasis: **bold** and *italic*. Start list items with `1.`, `-`, or `*`.
Inline: \(\alpha + \beta X\)
Block:
\[ \hat{\beta} = (X'X)^{-1}X'y \]