2.6

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Enumerate the sums of all 36 equally likely outcomes of rolling two dice.
dice_sums <- outer(1:6, 1:6, "+")

# a) P(sum = 1) = 0: the smallest possible sum of two dice is 2.
sum(dice_sums == 1) / length(dice_sums)

## [1] 0

# b) Sum of 5: (1,4), (4,1), (2,3), (3,2) -> 4 outcomes, so P(sum = 5) = 4/36.
sum(dice_sums == 5) / length(dice_sums)

## [1] 0.1111111

# c) Sum of 12: only (6,6) -> 1 outcome, so P(sum = 12) = 1/36.
sum(dice_sums == 12) / length(dice_sums)

## [1] 0.02777778

2.8

# a) No, the two events are not disjoint: they can happen at the same time.
#    4.2% of people fall into both categories.
# b) Venn diagram of the two events.
library(VennDiagram)

## Warning: package 'VennDiagram' was built under R version 3.4.3

## Loading required package: grid

## Loading required package: futile.logger

## Warning: package 'futile.logger' was built under R version 3.4.3

# All values below are percentages.
poverty <- 14.6
forlanguage <- 20.7
both <- 4.2
# Portion of each event that lies outside the overlap.
povEng <- poverty - both
forlanguageOnly <- forlanguage - both

# The first two positional arguments are the sizes of the two events and the
# fourth is the pair of category labels.
# NOTE(review): ind=FALSE presumably stops the function from drawing on its
# own so the result can be drawn explicitly with grid.draw() -- confirm
# against the VennDiagram documentation.
venn.plot <- draw.pairwise.venn(poverty, 
                                forlanguage,
                                cross.area=both, 
                                c("Poverty", "Foreign language"), 
                                fill=c("yellow", "lightblue"),
                                cat.dist=-0.08,
                                ind=FALSE)
grid.draw(venn.plot)

# c) Percent living below the poverty line who speak only English at home
#    (poverty minus the overlap, stored earlier in povEng).
povEng

## [1] 10.4

# d) Percent living below the poverty line OR speaking a foreign language
#    at home (general addition rule).
pov_or_forlanguage <- poverty + forlanguage - both
pov_or_forlanguage

## [1] 31.1

# e) Percent living above the poverty line who speak only English at home:
#    English-only speakers minus English-only speakers in poverty.
english_only <- 100 - forlanguage
english_only - povEng

## [1] 68.9

# f) Independence would require P(poverty) * P(foreign language) to equal
#    P(poverty and foreign language). The product below (0.030) differs from
#    the observed joint probability (0.042), so the events are not independent.
p_if_independent <- poverty/100 * forlanguage/100
p_if_independent

## [1] 0.030222

p_both <- both/100
p_both

## [1] 0.042

2.20

# Counts taken from the couples' eye-colour table (204 couples in total).
n_couples <- 204
male_blue <- 114
female_blue <- 108
both_blue <- 78

#a)
# P(male blue or female blue) = P(male blue) + P(female blue) - P(both blue)
male_blue / n_couples + female_blue / n_couples - both_blue / n_couples

## [1] 0.7058824

#b)
# P(female blue | male blue), reading the question as a conditional probability
both_blue / male_blue

## [1] 0.6842105

#c)
# P(female blue | male brown), reading the question as a conditional probability
female_blue_male_brown <- 19
male_brown <- 54
female_blue_male_brown / male_brown

## [1] 0.3518519

# P(female blue | male green), reading the question as a conditional probability
female_blue_male_green <- 11
male_green <- 36
female_blue_male_green / male_green

## [1] 0.3055556

#d)
# The eye colours of male respondents and their partners are not independent:
# P(female blue | male blue) = 0.68 is much larger than
# P(female blue | male brown) = 0.35 and P(female blue | male green) = 0.31,
# so partners tend to share the same eye colour.

2.30

#2.30
# Bookshelf counts: hc = hardcover, pb = paperback. The first element of each
# vector is fiction and the second is non-fiction.
hc <- c(13, 15)
pb <- c(59, 8)
total <- hc + pb
df <- data.frame(hc, pb, total)

# Append the column totals as a final row. The vector is deliberately not
# called `sum`, which would mask base::sum() for the rest of the session.
col_totals <- c(sum(hc), sum(pb), sum(total))
df <- rbind(df, col_totals)
row.names(df) <- c("fic", "non-fic", "total")

df

##         hc pb total
## fic     13 59    72
## non-fic 15  8    23
## total   28 67    95

# Counts read back from the table so the formulas below stay in sync with it.
n_books <- df["total", "total"]
n_hc <- df["total", "hc"]
n_fic <- df["fic", "total"]
n_hc_fic <- df["fic", "hc"]
n_pb_fic <- df["fic", "pb"]

#a)
# P(hardcover first) * P(paperback fiction second), without replacement.
# A hardcover first draw cannot remove a paperback fiction book, so all 59
# are still among the remaining 94 books.
p_a <- n_hc / n_books * n_pb_fic / (n_books - 1)
p_a

## [1] 0.1849944

#b)
# P(fiction first, hardcover second), without replacement.
# The first fiction book may itself be a hardcover, which leaves one fewer
# hardcover for the second draw, so the two cases must be added:
#   P(hc fiction first)  * P(hc second | hc removed)  = 13/95 * 27/94
#   P(pb fiction first)  * P(hc second | pb removed)  = 59/95 * 28/94
p_b <- n_hc_fic / n_books * (n_hc - 1) / (n_books - 1) +
  n_pb_fic / n_books * n_hc / (n_books - 1)
p_b

## [1] 0.2243001

#c)
# P(fiction first) * P(hardcover second), with replacement: the draws are
# independent, so the probabilities simply multiply.
p_c <- n_fic / n_books * n_hc / n_books
p_c

## [1] 0.2233795

#d)
# The answers to b) and c) are very close because removing a single book
# from a shelf of 95 barely changes its composition. The more books are drawn
# without replacement, the larger the difference becomes.

2.38

#a)
# Probability model for the baggage fee paid by a single passenger:
# 0, 1 or 2 checked bags, where the first bag costs $25 and the second $35.
prob <- c(0.54, 0.34, 0.12)
bags <- c(0, 1, 2)
fees <- c(0, 25, 25 + 35)

# Expected value: sum of fee * probability.
exp.value <- prob * fees
avg.rev.pp <- sum(exp.value)

# Variance: probability-weighted squared deviations from the mean.
diff.mean <- fees - avg.rev.pp
diff.mean.sqr <- (diff.mean)^2
diff.mean.sqrTimesProb <- diff.mean.sqr * prob
var.rev.pp <- sum(diff.mean.sqrTimesProb)


# Table of the intermediate steps (kept for inspection; not printed).
df <- rbind(bags,prob,fees,exp.value,diff.mean,diff.mean.sqr,diff.mean.sqrTimesProb)

std.rev.pp <- sqrt(var.rev.pp)

#average revenue per passenger
avg.rev.pp

## [1] 15.7

#corresponding standard deviation
std.rev.pp

## [1] 19.95019

#b)
# Total revenue for a flight of 120 passengers, assuming passengers check bags
# independently of each other. Means add, and so do variances, so the standard
# deviation of the total is sqrt(120) times the per-passenger standard
# deviation -- not the per-passenger standard deviation itself.
n.passengers <- 120
avg.rev.total <- n.passengers * avg.rev.pp
std.rev.total <- sqrt(n.passengers * var.rev.pp)

revenue.passenger.upper <- avg.rev.total + std.rev.total
revenue.passenger.lower <- avg.rev.total - std.rev.total

revenue.passenger.upper

## [1] 2102.543

revenue.passenger.lower

## [1] 1665.457

#Expected revenue is about $1,884 with a standard deviation of about $219,
#i.e. roughly [$1,665 to $2,103].

2.44

# Income brackets and the relative frequency of each one.
# The sixth label previously read "$50,000 to $64,000", which contradicts the
# bracket width of 14999 used below; it is "$50,000 to $64,999".
income <- c("$1 - $9,999 or loss", 
            "$10,000 to $14,999", 
            "$15,000 to $24,999",
            "$25,000 to $34,999",
            "$35,000 to $49,999",
            "$50,000 to $64,999",
            "$65,000 to $74,999",
            "$75,000 to $99,999",
            "$100,000 or more")
# Lower bound and width of each bracket; center is the bracket midpoint.
# NOTE(review): the last bracket is open-ended, so its width of 99999 is an
# arbitrary cap chosen for plotting -- confirm this is intended.
bounds <- c(1, 10000, 15000, 25000, 35000, 50000, 65000, 75000, 100000)
size <- c(9999, 4999, 9999, 9999, 14999, 14999, 9999, 24999, 99999)
center <- bounds + (size / 2)
total <- c(0.022, 0.047, 0.158, 0.183, 0.212, 0.139, 0.058, 0.084, 0.097)

df2 <- data.frame(income, center, total)
df2

##                income   center total
## 1 $1 - $9,999 or loss   5000.5 0.022
## 2  $10,000 to $14,999  12499.5 0.047
## 3  $15,000 to $24,999  19999.5 0.158
## 4  $25,000 to $34,999  29999.5 0.183
## 5  $35,000 to $49,999  42499.5 0.212
## 6  $50,000 to $64,999  57499.5 0.139
## 7  $65,000 to $74,999  69999.5 0.058
## 8  $75,000 to $99,999  87499.5 0.084
## 9    $100,000 or more 149999.5 0.097

# Relative frequencies again; these are the same nine values already assigned
# to `total` when df2 was built, so this reassignment is redundant.
total <- c(0.022,0.047,0.158,0.183,0.212,0.139,0.058,0.084,0.097)

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.4.3

# a) Bar chart of relative frequency against bracket midpoint.
# NOTE(review): `size` is not a column of df2 (which holds income, center and
# total), so aes(width=size) presumably picks up the `size` vector from the
# calling environment -- confirm. This ggplot2 version also reports `width`
# as an unknown aesthetic (see the warning below).
graph <- ggplot(data=df2) + 
  geom_bar(aes(x=center, y=total, width=size), stat='identity', position="identity") + 
  labs(x="Income ($)", 
       y="Relative Frequency", 
       title="Survey for 2005-2009")

## Warning: Ignoring unknown aesthetics: width

# The distribution looks bimodal, with peaks around $35K to $50K and at $100K+.
graph

#b)
# P(less than $50k): sum of the relative frequencies of the five brackets
# below $50,000. The value is printed so that part b) actually shows its answer.
less_50k_prob <- sum(df2$total[1:5])
less_50k_prob

## [1] 0.622

#c)
f_prob <- 0.41
# P(less than $50k and female) = P(less than $50k) * P(female).
# This multiplication is only valid under the assumption that income and
# gender are independent.
less_50k_prob * f_prob

## [1] 0.25502

#d)
# Observed P(less than $50k | female) from the data.
f_prob_data_less50k <- 0.718

# P(less than $50k | female) * P(female) is the joint probability implied by
# the data; it is noticeably different from the value in c).
f_prob_data_less50k * f_prob

## [1] 0.29438

# Check whether P(less than $50k and female) computed under independence,
# P(less than $50k) * P(female), equals P(less than $50k | female) * P(female).
less_50k_prob * f_prob

## [1] 0.25502

f_prob_data_less50k * f_prob

## [1] 0.29438

# Since less_50k_prob (0.622) is not equal to f_prob_data_less50k (0.718),
# making less than $50k and being female are not independent events, and the
# independence assumption used in c) does not hold.

Ch.2. Data606. HW#2

Sang Yoon (Andy) Hwang

February 16, 2018

2.6

2.8

2.20

2.30

2.38

2.44