Indexing
x <- c(2,3, 6,4, 7,4, 100)
x[c(1,4)]
## [1] 2 4
#drop element
x[-1]
## [1] 3 6 4 7 4 100
#change element of vector
x[2]<- 999
x
## [1] 2 999 6 4 7 4 100
Data Frames
nrow(mtcars) # number of rows
## [1] 32
ncol(mtcars) # number of columns
## [1] 11
dim(mtcars)
## [1] 32 11
names(mtcars) # names of the variable or columns
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear"
## [11] "carb"
row.names(mtcars) # names of the rows
## [1] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710"
## [4] "Hornet 4 Drive" "Hornet Sportabout" "Valiant"
## [7] "Duster 360" "Merc 240D" "Merc 230"
## [10] "Merc 280" "Merc 280C" "Merc 450SE"
## [13] "Merc 450SL" "Merc 450SLC" "Cadillac Fleetwood"
## [16] "Lincoln Continental" "Chrysler Imperial" "Fiat 128"
## [19] "Honda Civic" "Toyota Corolla" "Toyota Corona"
## [22] "Dodge Challenger" "AMC Javelin" "Camaro Z28"
## [25] "Pontiac Firebird" "Fiat X1-9" "Porsche 914-2"
## [28] "Lotus Europa" "Ford Pantera L" "Ferrari Dino"
## [31] "Maserati Bora" "Volvo 142E"
min(mtcars$mpg) #min of mpg
## [1] 10.4
mean(mtcars$mpg)
## [1] 20.09062
mtcars[,1] # extract first column and all rows
## [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 10.4
## [16] 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4 15.8 19.7
## [31] 15.0 21.4
mtcars[1,] #extract first row and all column
mtcars['Datsun 710',] # extract row for Datsun 710
mtcars[, 'mpg']
## [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 10.4
## [16] 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4 15.8 19.7
## [31] 15.0 21.4
mtcars[2,3] # extract element in the second row and third column
## [1] 160
mtcars[2:3,] #extract second and third row
mtcars[c(2,3),]
mtcars[-c(1:5),] # all rows except 1:5
Plot
plot(mtcars$wt, mtcars$mpg, xlab = "Wt.", ylab = 'MPG') #scatterplot
data_url <- "https://github.com/ericwfox/stat630data/raw/master/cdc.csv" #string for cdc data
cdc <- read.csv(data_url, header = TRUE)
names(cdc)
## [1] "genhlth" "exerany" "hlthplan" "smoke100" "height" "weight" "wtdesire"
## [8] "age" "gender"
dim(cdc)
## [1] 20000 9
summary(cdc$weight)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 68.0 140.0 165.0 169.7 190.0 500.0
sd(cdc$weight) # standard deviation
## [1] 40.08097
table(cdc$smoke100) # for catergorical variable
##
## 0 1
## 10559 9441
round(table(cdc$smoke100)/20000,2) # relative frequency, at two decimal point
##
## 0 1
## 0.53 0.47
barplot(table(cdc$smoke100))
addmargins(table(cdc$gender, cdc$smoke100)) #add margins for row and column totals
##
## 0 1 Sum
## f 6012 4419 10431
## m 4547 5022 9569
## Sum 10559 9441 20000
subset
cdc10 <- cdc[1:10,]
cdc10$gender
## [1] m f f f f f m m f m
## Levels: f m
subset(cdc10, gender =='m') #filter males
subset(cdc10, gender =='m' & weight>140) # filter male and >140
subset(cdc10, gender =='m' | weight>140) # filter male or >140
cdc_m <- subset(cdc, gender =='m') # males only
summary(cdc_m$weight)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 78.0 165.0 185.0 189.3 210.0 500.0
cdc_m_over50 <- subset(cdc, gender == 'm' & age >50) # males > 50 yrs
summary(cdc_m_over50$weight)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 78.0 165.0 185.0 188.6 208.0 400.0
plot(cdc$weight, cdc$wtdesire, xlab = "WT.", ylab = 'Desired WT.', cex = 0.6, abline(0,1))
# cex size of dot and abline is line
table(cdc$smoke100) # frequency table
##
## 0 1
## 10559 9441
table(cdc$smoke100)/20000 # relative ft
##
## 0 1
## 0.52795 0.47205
barplot(table(cdc$smoke100), xlab = "smoked at least 100 cigarettes", names.arg = c("no","yes"), ylab = "count")
barplot(table(cdc$smoke100)/20000, xlab = "smoked at least 100 cigarettes", names.arg = c("no","yes"), ylab = "proportion")
addmargins(table(cdc$smoke100, cdc$gender)) # contingency table
##
## f m Sum
## 0 6012 4547 10559
## 1 4419 5022 9441
## Sum 10431 9569 20000
#segmented bar plot
barplot(table(cdc$smoke100, cdc$gender), col = c("skyblue","lightgreen"), beside = TRUE)
#stacked bar plot
barplot(table(cdc$smoke100, cdc$gender), col = c("skyblue","lightgreen"))
# adding a legend
barplot(table(cdc$smoke100, cdc$gender), col = c("skyblue","lightgreen"), beside = TRUE, ylab = 'counts', ylim = c(0,7000))
legend(x = "topright", c('no', 'yes'), col = c("skyblue","lightgreen"), title = 'smokes', pch =18)
prop.table(table(cdc$smoke100, cdc$gender))
##
## f m
## 0 0.30060 0.22735
## 1 0.22095 0.25110
table(cdc$smoke100, cdc$gender)/20000 # same as prop.table
##
## f m
## 0 0.30060 0.22735
## 1 0.22095 0.25110
#Factors
class(cdc$genhlth)
## [1] "factor"
levels(cdc$genhlth) #factors levels or names
## [1] "excellent" "fair" "good" "poor" "very good"
table(cdc$genhlth)
##
## excellent fair good poor very good
## 4657 2019 5675 677 6972
cdc$genhlth <- factor(cdc$genhlth, levels = c("poor", "fair", "good", "very good", "excellent")) # reorder labels
table(cdc$genhlth)
##
## poor fair good very good excellent
## 677 2019 5675 6972 4657
barplot(table(cdc$genhlth), xlab = "general health", ylab = "count", col = "green")
# Histogram, for continuous variable
sort(mtcars$mpg)
## [1] 10.4 10.4 13.3 14.3 14.7 15.0 15.2 15.2 15.5 15.8 16.4 17.3 17.8 18.1 18.7
## [16] 19.2 19.2 19.7 21.0 21.0 21.4 21.4 21.5 22.8 22.8 24.4 26.0 27.3 30.4 30.4
## [31] 32.4 33.9
hist(mtcars$mpg, main = "Histogram of MPG", xlab = "Miles per Gallon")
# Attributes
h1 <- hist(mtcars$mpg)
attributes(h1)
## $names
## [1] "breaks" "counts" "density" "mids" "xname" "equidist"
##
## $class
## [1] "histogram"
h1$density
## [1] 0.0375 0.0750 0.0500 0.0125 0.0250
h1$breaks
## [1] 10 15 20 25 30 35
h1$counts
## [1] 6 12 8 2 4
hist(cdc$weight) # default bins
hist(cdc$weight, breaks = 50, main = "50 bins")
# Histogram Density plot, freq = F
hist(mtcars$mpg, freq = F, main = "Density Plot of MPG")
hist(cdc$weight, freq = F)
lines(density(cdc$weight), col = "red") # density() to superimpose a smooth #density curve over the hist
# Box Plots
boxplot(cdc$weight, ylab = "Weight")
boxplot(cdc$weight~cdc$gender)
Maps
library(maps)
map("world")
map("state","california")
map("county","ca")
#EPA Stream Data Set
data_url <- "https://raw.githubusercontent.com/ericwfox/stat630data/master/nrsa.csv"
nrsa <- read.csv(data_url, header = TRUE)
head(nrsa)
map("state")
points(nrsa$lon, nrsa$lat, cex = 0.5)
map("state")
points(nrsa$lon, nrsa$lat, cex = 0.5, col = c("red","green","blue")[nrsa$cond2])
legend("bottomright", c("poor", "fair", "good"),
col=c("red","green","blue"), pch=1)
nrsa_good = subset(nrsa, cond=="Good")
map("state")
points(nrsa_good$lon, nrsa_good$lat, col= "red"
, cex=0.5)
pnorm(1.35) # to compute prob P(Z<1.35)
## [1] 0.911492
qnorm(0.99) # to compute z values
## [1] 2.326348
hist(rnorm(100)) # generate 100 random numbers and plot hist
hist(rnorm(100, 10,3)) # generate 10 random numbers from N(10,3)
# plot normal density function(pdf) using dnorm()
grd <- seq(-4, 4, by = 0.01)
y <- dnorm(grd)
plot(grd,y, type = 'l', xlab = 'x', ylab = 'f(x)', main = 'N(0,1)') # type = 'l' for lines
Assessing Norlality
# histogram density with normal dist curve
cdc_male <- subset(cdc, gender =='m')
hist(cdc_male$height, breaks = 50, prob =T, xlab = " Male Heights")
grd <- seq(50,90,0.01)
y <- dnorm(grd, mean = mean(cdc_male$height), sd = sd(cdc_male$height))
lines(grd, y, col = "darkgreen")
# QQ plot
qqnorm(cdc_male$height) # normal prob plot
qqline(cdc_male$height) # add line for reference
#Generate n = 1000 random numbers from N(0; 1) and plot the density histogram and normal probability plot.
par(mfrow = c(1,2)) # so the plots are next to each other
sim_norm1000 <- rnorm(1000)
hist(sim_norm1000, prob=T)
grd <- seq(-3, 3, 0.01)
y <- dnorm(grd, mean=mean(sim_norm1000), sd=sd(sim_norm1000))
lines(grd, y, col='red')
qqnorm(sim_norm1000)
qqline(sim_norm1000)
#Generate 1000 random numbers from a normal distribution with mean = 100 andstandard deviation = 20, and then plot the histogram. Also make a normal probability plot withthe random numbers you generated.
par(mfrow=c(1,2)) # so that plots are beside each other
set.seed(100)
p = rnorm(1000, mean=100, sd=20)
hist(p, main="N(100, 20)")
qqnorm(p)
#CDC Data
summary(cdc$weight)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 68.0 140.0 165.0 169.7 190.0 500.0
par(mfrow = c(1,2))
hist(cdc$weight)
qqnorm(cdc$weight)
qqline(cdc$weight)
# 5000 samples
set.seed(999)
xbars <- rep(NA, 5000)
for(i in 1:5000) {
samp_i <- sample(cdc$weight, 30)
xbars[i] <- mean(samp_i)
}
par(mfrow=c(1,2), cex=0.6)
hist(xbars, xlab= "Mean weight", main=
"Histogram of Sample Means (n=30)")
qqnorm(xbars)
qqline(xbars)
#CDC Data
mu <- mean(cdc$weight)
round(mu,2)
## [1] 169.68
# 95% CI
sample1 <- sample(cdc$weight, 30)
round(qnorm(0.975),2) # z-critical value
## [1] 1.96
ci_lower <- mean(sample1) - 1.96 * sd(sample1)/sqrt(30)
ci_upper <- mean(sample1) + 1.96 * sd(sample1)/sqrt(30)
round(c(ci_lower,ci_upper),2)
## [1] 156.58 188.55