# Install the packages used for R Markdown documents.
# Package names must be wrapped in plain ASCII quotes; curly "smart" quotes
# (typically introduced by a word processor) are a syntax error in R.
install.packages(c("rmarkdown", "knitr"))

Arithmetic and Assignment

R can evaluate expressions just like a calculator:

# Create a new code chunk (keyboard shortcuts):
#   Windows / Linux: Ctrl + Alt + I
#   macOS:           Cmd + Option + I
# +C
# NOTE(review): the "+C" line above looks like a truncated shortcut; confirm or remove.

# Addition
2 + 3
## [1] 5
# Division (the result keeps its fractional part)
8 / 3
## [1] 2.666667
# Exponentiation
2^3
## [1] 8

You can also use built-in constants:

# pi is a built-in constant
pi
## [1] 3.141593
# exp(1) evaluates e^1, i.e. Euler's number e
exp(1)
## [1] 2.718282
# Function results can be used inside larger expressions
2*exp(1)
## [1] 5.436564

Assignment is done using <-:

# Create `height` and store 60 in it
height <- 60
# The right-hand side is evaluated first (60 + 5), then stored back: 65
height <- height + 5
# Overwrite once more: 2 * 65 = 130
height <- 2 * height

# Typing an object's name on its own prints its current value
height
## [1] 130

Vectors and Indexing

Create a vector and apply operations:

# Load the wage1 data set that ships with the wooldridge package
data(wage1, package = "wooldridge")

# Pull the wage column out of the data frame as a numeric vector
wages <- wage1$wage
# head() shows the first six elements by default
head(wages)
## [1] 3.10 3.24 3.00 6.00 5.30 8.75
# n controls how many elements are shown
head(wages, n=20)
##  [1]  3.10  3.24  3.00  6.00  5.30  8.75 11.25  5.00  3.60 18.18  6.25  8.13
## [13]  8.77  5.50 22.20 17.33  7.50 10.63  3.60  4.50
# summary() reports the minimum, quartiles, mean and maximum
summary(wages)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.530   3.330   4.650   5.896   6.880  24.980

Indexing and logical filtering:

# A logical condition inside [ ] keeps only the elements for which it is TRUE
wages[wages > 10]
##  [1] 11.25 18.18 22.20 17.33 10.63 12.50 12.50 13.00 13.70 21.63 11.71 12.39
## [13] 19.98 13.08 11.90 11.76 13.16 15.00 13.33 24.98 10.95 11.55 15.38 14.58
## [25] 12.50 21.86 11.11 22.86 18.16 10.91 18.00 18.89 13.95 18.16 11.98 12.22
## [37] 15.00 12.50 11.10 12.50 10.92 12.50 10.38 20.00 11.25 14.38 17.50 11.82
## [49] 12.50 17.71 15.00 11.56
which(wages > 20) # positions (indices) of the elements greater than 20
## [1]  15  59 112 186 229

Data Types

Check the structure and type of variables:

str(wage1)
## 'data.frame':    526 obs. of  24 variables:
##  $ wage    : num  3.1 3.24 3 6 5.3 ...
##  $ educ    : int  11 12 11 8 12 16 18 12 12 17 ...
##  $ exper   : int  2 22 2 44 7 9 15 5 26 22 ...
##  $ tenure  : int  0 2 0 28 2 8 7 3 4 21 ...
##  $ nonwhite: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ female  : int  1 1 0 0 0 0 0 1 1 0 ...
##  $ married : int  0 1 0 1 1 1 0 0 0 1 ...
##  $ numdep  : int  2 3 2 0 1 0 0 0 2 0 ...
##  $ smsa    : int  1 1 0 1 0 1 1 1 1 1 ...
##  $ northcen: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ south   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ west    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ construc: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ndurman : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ trcommpu: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ trade   : int  0 0 1 0 0 0 1 0 1 0 ...
##  $ services: int  0 1 0 0 0 0 0 0 0 0 ...
##  $ profserv: int  0 0 0 0 0 1 0 0 0 0 ...
##  $ profocc : int  0 0 0 0 0 1 1 1 1 1 ...
##  $ clerocc : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ servocc : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ lwage   : num  1.13 1.18 1.1 1.79 1.67 ...
##  $ expersq : int  4 484 4 1936 49 81 225 25 676 484 ...
##  $ tenursq : int  0 4 0 784 4 64 49 9 16 441 ...
##  - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
# typeof() reports how the values are stored internally
typeof(wage1$wage)
## [1] "double"
# female is still stored as 0/1 integers at this point, so it is not a factor
is.factor(wage1$female)
## [1] FALSE

Factors and Tables

Convert numeric to factor, inspect categories:

# Recode the 0/1 indicator as a factor: 0 becomes "Male" and 1 becomes "Female"
wage1$female <- factor(wage1$female, labels = c("Male", "Female"))
# table() counts the observations in each category
table(wage1$female)
## 
##   Male Female 
##    274    252

Missing Values

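Missing data appear as NA. Functions such as mean() return NA when their input contains a missing value, unless you explicitly drop the missing values (for example with na.rm = TRUE).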

# NA marks the missing third element
x <- c(1, 2, NA, 4)
# A single NA in the input makes the mean NA
mean(x)
## [1] NA
# na.rm = TRUE drops the NA first: (1 + 2 + 4) / 3
mean(x, na.rm = TRUE)
## [1] 2.333333

Data Frames and Filtering

Explore and filter data frames:

str(wage1) # 'wages' is a vector we created; 'wage1' is the full data frame
## 'data.frame':    526 obs. of  24 variables:
##  $ wage    : num  3.1 3.24 3 6 5.3 ...
##  $ educ    : int  11 12 11 8 12 16 18 12 12 17 ...
##  $ exper   : int  2 22 2 44 7 9 15 5 26 22 ...
##  $ tenure  : int  0 2 0 28 2 8 7 3 4 21 ...
##  $ nonwhite: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ female  : Factor w/ 2 levels "Male","Female": 2 2 1 1 1 1 1 2 2 1 ...
##  $ married : int  0 1 0 1 1 1 0 0 0 1 ...
##  $ numdep  : int  2 3 2 0 1 0 0 0 2 0 ...
##  $ smsa    : int  1 1 0 1 0 1 1 1 1 1 ...
##  $ northcen: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ south   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ west    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ construc: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ndurman : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ trcommpu: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ trade   : int  0 0 1 0 0 0 1 0 1 0 ...
##  $ services: int  0 1 0 0 0 0 0 0 0 0 ...
##  $ profserv: int  0 0 0 0 0 1 0 0 0 0 ...
##  $ profocc : int  0 0 0 0 0 1 1 1 1 1 ...
##  $ clerocc : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ servocc : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ lwage   : num  1.13 1.18 1.1 1.79 1.67 ...
##  $ expersq : int  4 484 4 1936 49 81 225 25 676 484 ...
##  $ tenursq : int  0 4 0 784 4 64 49 9 16 441 ...
##  - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
names(wage1) # list the names of all variables (columns)
##  [1] "wage"     "educ"     "exper"    "tenure"   "nonwhite" "female"  
##  [7] "married"  "numdep"   "smsa"     "northcen" "south"    "west"    
## [13] "construc" "ndurman"  "trcommpu" "trade"    "services" "profserv"
## [19] "profocc"  "clerocc"  "servocc"  "lwage"    "expersq"  "tenursq"
# glimpse() from dplyr prints one line per column with its type and first values
library(dplyr)
glimpse(wage1)
## Rows: 526
## Columns: 24
## $ wage     <dbl> 3.10, 3.24, 3.00, 6.00, 5.30, 8.75, 11.25, 5.00, 3.60, 18.18,…
## $ educ     <int> 11, 12, 11, 8, 12, 16, 18, 12, 12, 17, 16, 13, 12, 12, 12, 16…
## $ exper    <int> 2, 22, 2, 44, 7, 9, 15, 5, 26, 22, 8, 3, 15, 18, 31, 14, 10, …
## $ tenure   <int> 0, 2, 0, 28, 2, 8, 7, 3, 4, 21, 2, 0, 0, 3, 15, 0, 0, 10, 0, …
## $ nonwhite <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ female   <fct> Female, Female, Male, Male, Male, Male, Male, Female, Female,…
## $ married  <int> 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0…
## $ numdep   <int> 2, 3, 2, 0, 1, 0, 0, 0, 2, 0, 0, 0, 2, 0, 1, 1, 0, 0, 3, 0, 0…
## $ smsa     <int> 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ northcen <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ south    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ west     <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ construc <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ ndurman  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ trcommpu <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ trade    <int> 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ services <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ profserv <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1…
## $ profocc  <int> 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1…
## $ clerocc  <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ servocc  <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ lwage    <dbl> 1.1314021, 1.1755733, 1.0986123, 1.7917595, 1.6677068, 2.1690…
## $ expersq  <int> 4, 484, 4, 1936, 49, 81, 225, 25, 676, 484, 64, 9, 225, 324, …
## $ tenursq  <int> 0, 4, 0, 784, 4, 64, 49, 9, 16, 441, 4, 0, 0, 9, 225, 0, 0, 1…
head(wage1)
##   wage educ exper tenure nonwhite female married numdep smsa northcen south
## 1 3.10   11     2      0        0 Female       0      2    1        0     0
## 2 3.24   12    22      2        0 Female       1      3    1        0     0
## 3 3.00   11     2      0        0   Male       0      2    0        0     0
## 4 6.00    8    44     28        0   Male       1      0    1        0     0
## 5 5.30   12     7      2        0   Male       1      1    0        0     0
## 6 8.75   16     9      8        0   Male       1      0    1        0     0
##   west construc ndurman trcommpu trade services profserv profocc clerocc
## 1    1        0       0        0     0        0        0       0       0
## 2    1        0       0        0     0        1        0       0       0
## 3    1        0       0        0     1        0        0       0       0
## 4    1        0       0        0     0        0        0       0       1
## 5    1        0       0        0     0        0        0       0       0
## 6    1        0       0        0     0        0        1       1       0
##   servocc    lwage expersq tenursq
## 1       0 1.131402       4       0
## 2       1 1.175573     484       4
## 3       0 1.098612       4       0
## 4       0 1.791759    1936     784
## 5       0 1.667707      49       4
## 6       0 2.169054      81      64
summary(wage1)
##       wage             educ           exper           tenure      
##  Min.   : 0.530   Min.   : 0.00   Min.   : 1.00   Min.   : 0.000  
##  1st Qu.: 3.330   1st Qu.:12.00   1st Qu.: 5.00   1st Qu.: 0.000  
##  Median : 4.650   Median :12.00   Median :13.50   Median : 2.000  
##  Mean   : 5.896   Mean   :12.56   Mean   :17.02   Mean   : 5.105  
##  3rd Qu.: 6.880   3rd Qu.:14.00   3rd Qu.:26.00   3rd Qu.: 7.000  
##  Max.   :24.980   Max.   :18.00   Max.   :51.00   Max.   :44.000  
##     nonwhite         female       married           numdep     
##  Min.   :0.0000   Male  :274   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.0000   Female:252   1st Qu.:0.0000   1st Qu.:0.000  
##  Median :0.0000                Median :1.0000   Median :1.000  
##  Mean   :0.1027                Mean   :0.6084   Mean   :1.044  
##  3rd Qu.:0.0000                3rd Qu.:1.0000   3rd Qu.:2.000  
##  Max.   :1.0000                Max.   :1.0000   Max.   :6.000  
##       smsa           northcen         south             west       
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.000   Median :0.0000   Median :0.0000  
##  Mean   :0.7224   Mean   :0.251   Mean   :0.3555   Mean   :0.1692  
##  3rd Qu.:1.0000   3rd Qu.:0.750   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.000   Max.   :1.0000   Max.   :1.0000  
##     construc          ndurman          trcommpu           trade       
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.0000   Median :0.00000   Median :0.0000  
##  Mean   :0.04563   Mean   :0.1141   Mean   :0.04373   Mean   :0.2871  
##  3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:1.0000  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.00000   Max.   :1.0000  
##     services         profserv         profocc          clerocc      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.1008   Mean   :0.2586   Mean   :0.3669   Mean   :0.1673  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##     servocc           lwage            expersq          tenursq       
##  Min.   :0.0000   Min.   :-0.6349   Min.   :   1.0   Min.   :   0.00  
##  1st Qu.:0.0000   1st Qu.: 1.2030   1st Qu.:  25.0   1st Qu.:   0.00  
##  Median :0.0000   Median : 1.5369   Median : 182.5   Median :   4.00  
##  Mean   :0.1407   Mean   : 1.6233   Mean   : 473.4   Mean   :  78.15  
##  3rd Qu.:0.0000   3rd Qu.: 1.9286   3rd Qu.: 676.0   3rd Qu.:  49.00  
##  Max.   :1.0000   Max.   : 3.2181   Max.   :2601.0   Max.   :1936.00
summary(wage1$educ)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   12.00   12.00   12.56   14.00   18.00
# Keep the rows where educ is greater than 16, together with every column
high_edu <- wage1[wage1$educ > 16, ] 
# Indexing is df[rows, columns]; leaving a position blank means "all",
# e.g. df[, 2] returns all rows of column 2
head(high_edu)
##      wage educ exper tenure nonwhite female married numdep smsa northcen south
## 7   11.25   18    15      7        0   Male       0      0    1        0     0
## 10  18.18   17    22     21        0   Male       1      0    1        0     0
## 59  21.63   18     8      8        0 Female       0      0    1        0     0
## 72  13.08   17    17      2        1   Male       1      3    1        0     0
## 80   7.14   18    13      0        0   Male       1      2    1        0     0
## 110  9.80   17     7      0        0   Male       1      0    1        0     0
##     west construc ndurman trcommpu trade services profserv profocc clerocc
## 7      1        0       0        0     1        0        0       1       0
## 10     1        0       0        0     0        0        0       1       0
## 59     0        0       0        0     0        0        1       1       0
## 72     0        0       0        0     0        0        1       1       0
## 80     0        0       0        0     0        0        1       1       0
## 110    0        0       0        0     0        1        0       1       0
##     servocc    lwage expersq tenursq
## 7         0 2.420368     225      49
## 10        0 2.900322     484     441
## 59        0 3.074081      64      64
## 72        0 2.571084     289       4
## 80        0 1.965713     169       0
## 110       0 2.282382      49       0
# Create a sub-sample: store it for further analysis, then display it
wage_subset <- subset(wage1, wage > 10 & exper > 5)
wage_subset

Reading and Writing Data

# Chunk options:
#   eval=FALSE -> shows the code in the output but does not run it.
#   echo=FALSE -> shows the results only.
library(readr)
library(readxl)

# Write the data frame to a CSV file
write_csv(wage1, "wage1_output.csv")

# Read the .csv file back in
df <- read_csv("wage1_output.csv")

# Read an Excel workbook. Nothing above creates "wage1_output.xlsx" (readxl can
# only read workbooks), so attempt the read only when the file is really there
# instead of stopping with a "path does not exist" error.
if (file.exists("wage1_output.xlsx")) {
  print(read_excel("wage1_output.xlsx"))
} else {
  message("wage1_output.xlsx not found; skipping the read_excel() example")
}

Using Packages and Conflicts

Load a package, or access functions selectively:

# dplyr was attached earlier with library(dplyr). wooldridge does not have to be
# attached: the pkg::name form reaches into an installed package directly.
head(wooldridge::wage1)
# select() keeps only the listed columns
select(wage1, wage, educ)

Function name conflicts can occur. For example, select() exists in both MASS and dplyr.

# Attaching MASS after dplyr makes a bare select() refer to the MASS version
library(MASS)
#select(wage1, exper, wage) # error occurs: this call now reaches MASS::select
# Use the dplyr version explicitly with the pkg::function form
dplyr::select(wage1, exper, wage)

Errors and Warnings

Common mistakes:

# Object not found: the name is misspelled, so no object called `wgae` exists
#mean(wgae)

# Invalid index
#wage1[, 200] # df 'wage1' only has 24 columns

# Comparison vs assignment
# Demonstrate on a copy: assigning to wage1$wage directly would overwrite the
# real wages with 5 and distort every later use of wage1.
wage_demo <- wage1
wage_demo$wage = 5       # `=` assigns: every observation of wage becomes 5
wage_demo$wage == 5      # `==` compares: one TRUE/FALSE per observation

Warnings do not stop execution but signal issues:

a <- 1:4
a[5] <- 100:200  # length-mismatch warning: one slot, 101 values; only the first (100) is stored
## Warning in a[5] <- 100:200: number of items to replace is not a multiple of replacement length

Use debugging tools:

# traceback() prints the call stack of the most recent uncaught error
traceback()
## No traceback available
print("Use print() inside functions to debug")
## [1] "Use print() inside functions to debug"

Visualization

Use ggplot2 to visualize variable distribution:

# Attach ggplot2 so that ggplot(), aes(), geom_histogram() and labs() are found;
# without it this chunk stops with 'could not find function "ggplot"'.
library(ggplot2)

# Histogram of wages using 30 bins
ggplot(wage1, aes(x = wage)) +
  geom_histogram(bins = 30, fill = "steelblue") +
  labs(title = "Wage Distribution", x = "Wage", y = "Count")

R Markdown Basics

You can mix code and explanation in one document.

YAML Header

---
title: "Homework 1"
author: "Riley Student"
output: html_document
---

Formatting Tips

  • Inline code: `mean(x)`
  • Bold: **bold**
  • Italic: *italic*
  • Lists: use 1., -, or *

Math Support

Inline: \(\alpha + \beta X\)
Block:

\[ \hat{\beta} = (X'X)^{-1}X'y \]

Further Learning