Introduction to Statistics with R

Lab 1

Indexing

x <- c(2,3, 6,4, 7,4, 100)
x[c(1,4)]
## [1] 2 4
#drop element
x[-1]
## [1]   3   6   4   7   4 100
#change element of vector
x[2]<- 999
x
## [1]   2 999   6   4   7   4 100

Data Frames

nrow(mtcars) # number of rows
## [1] 32
ncol(mtcars) # number of columns
## [1] 11
dim(mtcars)
## [1] 32 11
names(mtcars) # names of the variable or columns 
##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
## [11] "carb"
row.names(mtcars) # names of the rows
##  [1] "Mazda RX4"           "Mazda RX4 Wag"       "Datsun 710"         
##  [4] "Hornet 4 Drive"      "Hornet Sportabout"   "Valiant"            
##  [7] "Duster 360"          "Merc 240D"           "Merc 230"           
## [10] "Merc 280"            "Merc 280C"           "Merc 450SE"         
## [13] "Merc 450SL"          "Merc 450SLC"         "Cadillac Fleetwood" 
## [16] "Lincoln Continental" "Chrysler Imperial"   "Fiat 128"           
## [19] "Honda Civic"         "Toyota Corolla"      "Toyota Corona"      
## [22] "Dodge Challenger"    "AMC Javelin"         "Camaro Z28"         
## [25] "Pontiac Firebird"    "Fiat X1-9"           "Porsche 914-2"      
## [28] "Lotus Europa"        "Ford Pantera L"      "Ferrari Dino"       
## [31] "Maserati Bora"       "Volvo 142E"
min(mtcars$mpg) #min of mpg
## [1] 10.4
mean(mtcars$mpg)
## [1] 20.09062
mtcars[,1] # extract first column and all rows
##  [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 10.4
## [16] 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4 15.8 19.7
## [31] 15.0 21.4
mtcars[1,] #extract first row and all column
mtcars['Datsun 710',] # extract row for Datsun 710
mtcars[, 'mpg']
##  [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 10.4
## [16] 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4 15.8 19.7
## [31] 15.0 21.4
mtcars[2,3] # extract element in the second row and third column
## [1] 160
mtcars[2:3,] #extract second and third row
mtcars[c(2,3),]
mtcars[-c(1:5),] # all rows except 1:5

Plot

plot(mtcars$wt, mtcars$mpg, xlab = "Wt.", ylab = 'MPG') #scatterplot

Lab 2

data_url <- "https://github.com/ericwfox/stat630data/raw/master/cdc.csv" #string for cdc data 
cdc <- read.csv(data_url, header = TRUE)
names(cdc)
## [1] "genhlth"  "exerany"  "hlthplan" "smoke100" "height"   "weight"   "wtdesire"
## [8] "age"      "gender"
dim(cdc)
## [1] 20000     9
summary(cdc$weight)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    68.0   140.0   165.0   169.7   190.0   500.0
sd(cdc$weight) # standard deviation
## [1] 40.08097
table(cdc$smoke100) # for catergorical variable
## 
##     0     1 
## 10559  9441
round(table(cdc$smoke100)/20000,2) # relative frequency, at two decimal point
## 
##    0    1 
## 0.53 0.47
barplot(table(cdc$smoke100))

addmargins(table(cdc$gender, cdc$smoke100)) #add margins for row and column totals
##      
##           0     1   Sum
##   f    6012  4419 10431
##   m    4547  5022  9569
##   Sum 10559  9441 20000

subset

cdc10 <- cdc[1:10,]
cdc10$gender
##  [1] m f f f f f m m f m
## Levels: f m
subset(cdc10, gender =='m') #filter males
subset(cdc10,  gender =='m' & weight>140) # filter male and  >140
subset(cdc10,  gender =='m' | weight>140) # filter male or  >140
cdc_m <- subset(cdc, gender =='m') # males only
summary(cdc_m$weight)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    78.0   165.0   185.0   189.3   210.0   500.0
cdc_m_over50 <- subset(cdc, gender == 'm' & age >50) # males > 50 yrs
summary(cdc_m_over50$weight)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    78.0   165.0   185.0   188.6   208.0   400.0
plot(cdc$weight, cdc$wtdesire, xlab = "WT.", ylab = 'Desired WT.', cex = 0.6, abline(0,1)) 

# cex size of dot and abline is line

Lab 3

table(cdc$smoke100) # frequency table
## 
##     0     1 
## 10559  9441
table(cdc$smoke100)/20000 # relative ft
## 
##       0       1 
## 0.52795 0.47205
barplot(table(cdc$smoke100), xlab = "smoked at least 100 cigarettes", names.arg = c("no","yes"), ylab = "count")

barplot(table(cdc$smoke100)/20000, xlab = "smoked at least 100 cigarettes", names.arg = c("no","yes"), ylab = "proportion")

addmargins(table(cdc$smoke100, cdc$gender)) # contingency table
##      
##           f     m   Sum
##   0    6012  4547 10559
##   1    4419  5022  9441
##   Sum 10431  9569 20000
#segmented bar plot
barplot(table(cdc$smoke100, cdc$gender), col = c("skyblue","lightgreen"), beside = TRUE)

#stacked bar plot
barplot(table(cdc$smoke100, cdc$gender), col = c("skyblue","lightgreen"))

# adding a legend
barplot(table(cdc$smoke100, cdc$gender), col = c("skyblue","lightgreen"), beside = TRUE, ylab = 'counts', ylim = c(0,7000))
legend(x = "topright", c('no', 'yes'), col = c("skyblue","lightgreen"), title = 'smokes', pch =18)

prop.table(table(cdc$smoke100, cdc$gender))
##    
##           f       m
##   0 0.30060 0.22735
##   1 0.22095 0.25110
table(cdc$smoke100, cdc$gender)/20000 # same as prop.table
##    
##           f       m
##   0 0.30060 0.22735
##   1 0.22095 0.25110
#Factors
class(cdc$genhlth)
## [1] "factor"
levels(cdc$genhlth) #factors levels or names
## [1] "excellent" "fair"      "good"      "poor"      "very good"
table(cdc$genhlth)
## 
## excellent      fair      good      poor very good 
##      4657      2019      5675       677      6972
cdc$genhlth <- factor(cdc$genhlth, levels = c("poor", "fair", "good", "very good", "excellent")) # reorder labels
table(cdc$genhlth)
## 
##      poor      fair      good very good excellent 
##       677      2019      5675      6972      4657
barplot(table(cdc$genhlth), xlab = "general health", ylab = "count", col = "green")                                          

# Histogram, for continuous variable
sort(mtcars$mpg)
##  [1] 10.4 10.4 13.3 14.3 14.7 15.0 15.2 15.2 15.5 15.8 16.4 17.3 17.8 18.1 18.7
## [16] 19.2 19.2 19.7 21.0 21.0 21.4 21.4 21.5 22.8 22.8 24.4 26.0 27.3 30.4 30.4
## [31] 32.4 33.9
hist(mtcars$mpg, main = "Histogram of MPG", xlab = "Miles per Gallon")

# Attributes
h1 <- hist(mtcars$mpg)

attributes(h1)
## $names
## [1] "breaks"   "counts"   "density"  "mids"     "xname"    "equidist"
## 
## $class
## [1] "histogram"
h1$density
## [1] 0.0375 0.0750 0.0500 0.0125 0.0250
h1$breaks
## [1] 10 15 20 25 30 35
h1$counts
## [1]  6 12  8  2  4
hist(cdc$weight) # default bins

hist(cdc$weight, breaks = 50, main = "50 bins")

# Histogram Density plot, freq = F
hist(mtcars$mpg, freq = F, main = "Density Plot of MPG")

hist(cdc$weight, freq = F)
lines(density(cdc$weight), col = "red") # density() to superimpose a smooth #density curve over the hist

# Box Plots
boxplot(cdc$weight, ylab = "Weight")

boxplot(cdc$weight~cdc$gender)

Maps

library(maps)
map("world")

map("state","california")

map("county","ca")

#EPA Stream Data Set
data_url <- "https://raw.githubusercontent.com/ericwfox/stat630data/master/nrsa.csv"
nrsa <- read.csv(data_url, header = TRUE)
head(nrsa)
map("state")
points(nrsa$lon, nrsa$lat, cex = 0.5)

map("state")
points(nrsa$lon, nrsa$lat, cex = 0.5,  col = c("red","green","blue")[nrsa$cond2])
legend("bottomright", c("poor", "fair", "good"),
col=c("red","green","blue"), pch=1)

nrsa_good = subset(nrsa, cond=="Good")
map("state")
points(nrsa_good$lon, nrsa_good$lat, col= "red"
, cex=0.5)

Lab 4: Normal Distribution in R

pnorm(1.35) # to compute prob P(Z<1.35)
## [1] 0.911492
qnorm(0.99) # to compute z values
## [1] 2.326348
hist(rnorm(100)) # generate 100 random numbers and plot hist

hist(rnorm(100, 10,3)) # generate 10 random numbers from N(10,3)

# plot normal density function(pdf) using dnorm()
grd <- seq(-4, 4, by = 0.01)
y <- dnorm(grd)
plot(grd,y, type = 'l', xlab = 'x', ylab = 'f(x)', main = 'N(0,1)') # type = 'l' for lines

Assessing Norlality

# histogram density with normal dist curve
cdc_male <- subset(cdc, gender =='m')
hist(cdc_male$height, breaks = 50, prob =T, xlab = " Male Heights")
grd <- seq(50,90,0.01)
y <- dnorm(grd, mean = mean(cdc_male$height), sd = sd(cdc_male$height))
lines(grd, y, col = "darkgreen")

# QQ plot
qqnorm(cdc_male$height) # normal prob plot
qqline(cdc_male$height)  # add line for reference

#Generate n = 1000 random numbers from N(0; 1) and plot the density histogram and normal probability plot.
par(mfrow = c(1,2)) # so the plots are next to each other
sim_norm1000 <- rnorm(1000)
hist(sim_norm1000, prob=T)
grd <- seq(-3, 3, 0.01)
y <- dnorm(grd, mean=mean(sim_norm1000), sd=sd(sim_norm1000))
lines(grd, y, col='red')

qqnorm(sim_norm1000)
qqline(sim_norm1000)

#Generate 1000 random numbers from a normal distribution with mean  = 100 andstandard deviation  = 20, and then plot the histogram. Also make a normal probability plot withthe random numbers you generated.

par(mfrow=c(1,2)) # so that plots are beside each other
set.seed(100)
p = rnorm(1000, mean=100, sd=20)
hist(p, main="N(100, 20)")
qqnorm(p)

Lab 5: Sampling Distribution

#CDC Data
summary(cdc$weight)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    68.0   140.0   165.0   169.7   190.0   500.0
par(mfrow = c(1,2))
hist(cdc$weight)
qqnorm(cdc$weight)
qqline(cdc$weight)

# 5000 samples
set.seed(999)
xbars <- rep(NA, 5000)
for(i in 1:5000) {
samp_i <- sample(cdc$weight, 30)
xbars[i] <- mean(samp_i)
}

par(mfrow=c(1,2), cex=0.6)
hist(xbars, xlab= "Mean weight", main=
"Histogram of Sample Means (n=30)")
qqnorm(xbars)
qqline(xbars)

Lab 6 : Confidence Interval

#CDC Data
mu <- mean(cdc$weight)
round(mu,2)
## [1] 169.68
# 95% CI
sample1 <- sample(cdc$weight, 30)
round(qnorm(0.975),2) # z-critical value
## [1] 1.96
ci_lower <- mean(sample1) - 1.96 * sd(sample1)/sqrt(30)
ci_upper <- mean(sample1) + 1.96 * sd(sample1)/sqrt(30)
round(c(ci_lower,ci_upper),2)
## [1] 156.58 188.55