Introduction to Statistics with R

Lab 1

Indexing

x <- c(2,3, 6,4, 7,4, 100)
x[c(1,4)]

## [1] 2 4

#drop element
x[-1]

## [1]   3   6   4   7   4 100

#change element of vector
x[2]<- 999
x

## [1]   2 999   6   4   7   4 100

Data Frames

nrow(mtcars) # number of rows

## [1] 32

ncol(mtcars) # number of columns

## [1] 11

dim(mtcars)

## [1] 32 11

names(mtcars) # names of the variable or columns

##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
## [11] "carb"

row.names(mtcars) # names of the rows

##  [1] "Mazda RX4"           "Mazda RX4 Wag"       "Datsun 710"         
##  [4] "Hornet 4 Drive"      "Hornet Sportabout"   "Valiant"            
##  [7] "Duster 360"          "Merc 240D"           "Merc 230"           
## [10] "Merc 280"            "Merc 280C"           "Merc 450SE"         
## [13] "Merc 450SL"          "Merc 450SLC"         "Cadillac Fleetwood" 
## [16] "Lincoln Continental" "Chrysler Imperial"   "Fiat 128"           
## [19] "Honda Civic"         "Toyota Corolla"      "Toyota Corona"      
## [22] "Dodge Challenger"    "AMC Javelin"         "Camaro Z28"         
## [25] "Pontiac Firebird"    "Fiat X1-9"           "Porsche 914-2"      
## [28] "Lotus Europa"        "Ford Pantera L"      "Ferrari Dino"       
## [31] "Maserati Bora"       "Volvo 142E"

min(mtcars$mpg) #min of mpg

## [1] 10.4

mean(mtcars$mpg)

## [1] 20.09062

mtcars[,1] # extract first column and all rows

##  [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 10.4
## [16] 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4 15.8 19.7
## [31] 15.0 21.4

mtcars[1,] #extract first row and all column

mtcars['Datsun 710',] # extract row for Datsun 710

mtcars[, 'mpg']

##  [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 10.4
## [16] 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4 15.8 19.7
## [31] 15.0 21.4

mtcars[2,3] # extract element in the second row and third column

## [1] 160

mtcars[2:3,] #extract second and third row

mtcars[c(2,3),]

mtcars[-c(1:5),] # all rows except 1:5

Plot

plot(mtcars$wt, mtcars$mpg, xlab = "Wt.", ylab = 'MPG') #scatterplot

Lab 2

data_url <- "https://github.com/ericwfox/stat630data/raw/master/cdc.csv" #string for cdc data 
cdc <- read.csv(data_url, header = TRUE)
names(cdc)

## [1] "genhlth"  "exerany"  "hlthplan" "smoke100" "height"   "weight"   "wtdesire"
## [8] "age"      "gender"

dim(cdc)

## [1] 20000     9

summary(cdc$weight)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    68.0   140.0   165.0   169.7   190.0   500.0

sd(cdc$weight) # standard deviation

## [1] 40.08097

table(cdc$smoke100) # for catergorical variable

## 
##     0     1 
## 10559  9441

round(table(cdc$smoke100)/20000,2) # relative frequency, at two decimal point

## 
##    0    1 
## 0.53 0.47

barplot(table(cdc$smoke100))

addmargins(table(cdc$gender, cdc$smoke100)) #add margins for row and column totals

##      
##           0     1   Sum
##   f    6012  4419 10431
##   m    4547  5022  9569
##   Sum 10559  9441 20000

subset

cdc10 <- cdc[1:10,]
cdc10$gender

##  [1] m f f f f f m m f m
## Levels: f m

subset(cdc10, gender =='m') #filter males

subset(cdc10,  gender =='m' & weight>140) # filter male and  >140

subset(cdc10,  gender =='m' | weight>140) # filter male or  >140

cdc_m <- subset(cdc, gender =='m') # males only
summary(cdc_m$weight)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    78.0   165.0   185.0   189.3   210.0   500.0

cdc_m_over50 <- subset(cdc, gender == 'm' & age >50) # males > 50 yrs
summary(cdc_m_over50$weight)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    78.0   165.0   185.0   188.6   208.0   400.0

plot(cdc$weight, cdc$wtdesire, xlab = "WT.", ylab = 'Desired WT.', cex = 0.6, abline(0,1))

# cex size of dot and abline is line

Lab 3

table(cdc$smoke100) # frequency table

## 
##     0     1 
## 10559  9441

table(cdc$smoke100)/20000 # relative ft

## 
##       0       1 
## 0.52795 0.47205

barplot(table(cdc$smoke100), xlab = "smoked at least 100 cigarettes", names.arg = c("no","yes"), ylab = "count")

barplot(table(cdc$smoke100)/20000, xlab = "smoked at least 100 cigarettes", names.arg = c("no","yes"), ylab = "proportion")

addmargins(table(cdc$smoke100, cdc$gender)) # contingency table

##      
##           f     m   Sum
##   0    6012  4547 10559
##   1    4419  5022  9441
##   Sum 10431  9569 20000

#segmented bar plot
barplot(table(cdc$smoke100, cdc$gender), col = c("skyblue","lightgreen"), beside = TRUE)

#stacked bar plot
barplot(table(cdc$smoke100, cdc$gender), col = c("skyblue","lightgreen"))

# adding a legend
barplot(table(cdc$smoke100, cdc$gender), col = c("skyblue","lightgreen"), beside = TRUE, ylab = 'counts', ylim = c(0,7000))
legend(x = "topright", c('no', 'yes'), col = c("skyblue","lightgreen"), title = 'smokes', pch =18)

prop.table(table(cdc$smoke100, cdc$gender))

##    
##           f       m
##   0 0.30060 0.22735
##   1 0.22095 0.25110

table(cdc$smoke100, cdc$gender)/20000 # same as prop.table

##    
##           f       m
##   0 0.30060 0.22735
##   1 0.22095 0.25110

#Factors
class(cdc$genhlth)

## [1] "factor"

levels(cdc$genhlth) #factors levels or names

## [1] "excellent" "fair"      "good"      "poor"      "very good"

table(cdc$genhlth)

## 
## excellent      fair      good      poor very good 
##      4657      2019      5675       677      6972

cdc$genhlth <- factor(cdc$genhlth, levels = c("poor", "fair", "good", "very good", "excellent")) # reorder labels
table(cdc$genhlth)

## 
##      poor      fair      good very good excellent 
##       677      2019      5675      6972      4657

barplot(table(cdc$genhlth), xlab = "general health", ylab = "count", col = "green")

# Histogram, for continuous variable
sort(mtcars$mpg)

##  [1] 10.4 10.4 13.3 14.3 14.7 15.0 15.2 15.2 15.5 15.8 16.4 17.3 17.8 18.1 18.7
## [16] 19.2 19.2 19.7 21.0 21.0 21.4 21.4 21.5 22.8 22.8 24.4 26.0 27.3 30.4 30.4
## [31] 32.4 33.9

hist(mtcars$mpg, main = "Histogram of MPG", xlab = "Miles per Gallon")

# Attributes
h1 <- hist(mtcars$mpg)

attributes(h1)

## $names
## [1] "breaks"   "counts"   "density"  "mids"     "xname"    "equidist"
## 
## $class
## [1] "histogram"

h1$density

## [1] 0.0375 0.0750 0.0500 0.0125 0.0250

h1$breaks

## [1] 10 15 20 25 30 35

h1$counts

## [1]  6 12  8  2  4

hist(cdc$weight) # default bins

hist(cdc$weight, breaks = 50, main = "50 bins")

# Histogram Density plot, freq = F
hist(mtcars$mpg, freq = F, main = "Density Plot of MPG")

hist(cdc$weight, freq = F)
lines(density(cdc$weight), col = "red") # density() to superimpose a smooth #density curve over the hist

# Box Plots
boxplot(cdc$weight, ylab = "Weight")

boxplot(cdc$weight~cdc$gender)

Maps

library(maps)
map("world")

map("state","california")

map("county","ca")

#EPA Stream Data Set
data_url <- "https://raw.githubusercontent.com/ericwfox/stat630data/master/nrsa.csv"
nrsa <- read.csv(data_url, header = TRUE)

head(nrsa)

map("state")
points(nrsa$lon, nrsa$lat, cex = 0.5)

map("state")
points(nrsa$lon, nrsa$lat, cex = 0.5,  col = c("red","green","blue")[nrsa$cond2])
legend("bottomright", c("poor", "fair", "good"),
col=c("red","green","blue"), pch=1)

nrsa_good = subset(nrsa, cond=="Good")
map("state")
points(nrsa_good$lon, nrsa_good$lat, col= "red"
, cex=0.5)

Lab 4: Normal Distribution in R

pnorm(1.35) # to compute prob P(Z<1.35)

## [1] 0.911492

qnorm(0.99) # to compute z values

## [1] 2.326348

hist(rnorm(100)) # generate 100 random numbers and plot hist

hist(rnorm(100, 10,3)) # generate 10 random numbers from N(10,3)

# plot normal density function(pdf) using dnorm()
grd <- seq(-4, 4, by = 0.01)
y <- dnorm(grd)
plot(grd,y, type = 'l', xlab = 'x', ylab = 'f(x)', main = 'N(0,1)') # type = 'l' for lines

Assessing Norlality

# histogram density with normal dist curve
cdc_male <- subset(cdc, gender =='m')
hist(cdc_male$height, breaks = 50, prob =T, xlab = " Male Heights")
grd <- seq(50,90,0.01)
y <- dnorm(grd, mean = mean(cdc_male$height), sd = sd(cdc_male$height))
lines(grd, y, col = "darkgreen")

# QQ plot
qqnorm(cdc_male$height) # normal prob plot
qqline(cdc_male$height)  # add line for reference

#Generate n = 1000 random numbers from N(0; 1) and plot the density histogram and normal probability plot.
par(mfrow = c(1,2)) # so the plots are next to each other
sim_norm1000 <- rnorm(1000)
hist(sim_norm1000, prob=T)
grd <- seq(-3, 3, 0.01)
y <- dnorm(grd, mean=mean(sim_norm1000), sd=sd(sim_norm1000))
lines(grd, y, col='red')

qqnorm(sim_norm1000)
qqline(sim_norm1000)

#Generate 1000 random numbers from a normal distribution with mean  = 100 andstandard deviation  = 20, and then plot the histogram. Also make a normal probability plot withthe random numbers you generated.

par(mfrow=c(1,2)) # so that plots are beside each other
set.seed(100)
p = rnorm(1000, mean=100, sd=20)
hist(p, main="N(100, 20)")
qqnorm(p)

Lab 5: Sampling Distribution

#CDC Data
summary(cdc$weight)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    68.0   140.0   165.0   169.7   190.0   500.0

par(mfrow = c(1,2))
hist(cdc$weight)
qqnorm(cdc$weight)
qqline(cdc$weight)

# 5000 samples
set.seed(999)
xbars <- rep(NA, 5000)
for(i in 1:5000) {
samp_i <- sample(cdc$weight, 30)
xbars[i] <- mean(samp_i)
}

par(mfrow=c(1,2), cex=0.6)
hist(xbars, xlab= "Mean weight", main=
"Histogram of Sample Means (n=30)")
qqnorm(xbars)
qqline(xbars)

Lab 6 : Confidence Interval

#CDC Data
mu <- mean(cdc$weight)
round(mu,2)

## [1] 169.68

# 95% CI
sample1 <- sample(cdc$weight, 30)
round(qnorm(0.975),2) # z-critical value

## [1] 1.96

ci_lower <- mean(sample1) - 1.96 * sd(sample1)/sqrt(30)
ci_upper <- mean(sample1) + 1.96 * sd(sample1)/sqrt(30)
round(c(ci_lower,ci_upper),2)

## [1] 156.58 188.55

STAT 630

Min Tamang

12/9/2018

Introduction to Statistics with R

Lab 1

Lab 2

Lab 3

Lab 4: Normal Distribution in R

Lab 5: Sampling Distribution

Lab 6 : Confidence Interval