getwd()
## [1] "/Users/nishitatamuly/Desktop/HU/ANLY 545 - R/R"
setwd("/Users/nishitatamuly/Desktop/HU/ANLY 545 - R/R")
getwd()
## [1] "/Users/nishitatamuly/Desktop/HU/ANLY 545 - R/R"
##install.packages("HistData")
library(HistData)
library(magrittr)
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(vcd)
## Loading required package: grid
Make a plot of Ratio over Year. What features stand out? Which plot do you prefer to display the tendency for more male births?
maxRatio=max(Arbuthnot$Ratio)
minRatio=min(Arbuthnot$Ratio)
meanRatio=mean(Arbuthnot$Ratio)
plot_ly(Arbuthnot, x = ~Year, y = ~Ratio, name = 'Ratio: Male vs. Female',type = 'scatter', mode = 'lines',line = list(color = 'purple', width = 1.5))%>%
add_trace(y = ~maxRatio, name = 'Maximum', mode = 'lines',line = list(color = 'green', width = 2)) %>%
add_trace(y = ~minRatio, name = 'Minimum', mode = 'lines',line = list(color = 'red', width = 2)) %>%
add_trace(y = ~meanRatio, name = 'Mean', mode = 'lines',line = list(color = 'black', width = 2))
We see from the plot the ratio of Male:Femal births is above the mean in most cases displaying that there are more male births on average.
Plot the total number of christenings, Males + Females or Total (in 000s) over time. What unusual features do you see?
maximum=max(Arbuthnot$Total)
minimum=min(Arbuthnot$Total)
plot_ly(Arbuthnot, x = ~Year, y = ~Total, name = 'Total Number of Christenings (in 000s))', type = 'scatter', mode = 'line',line = list(color = 'blue', width = 1.5)) %>%
add_trace(y = ~maximum, name = 'Max Christening', line = list(color = 'green', width = 2)) %>%
add_trace(y = ~minimum, name = 'Min Christening', line = list(color = 'red', width = 2)) %>%
layout(xaxis = list(title = "Year"),
yaxis = list (title = "Total Number of Christenings (in 000s)"))
## A line object has been specified, but lines is not in the mode
## Adding lines to the mode...
## A line object has been specified, but lines is not in the mode
## Adding lines to the mode...
## A line object has been specified, but lines is not in the mode
## Adding lines to the mode...
Although christenings increase over time, there is a huge / drastic drop in christenings in the 1640s as well as in 1704. We know there was a drop in births but could there be a historical or cultutral reason in that period for no christenings?
Make a reasonable plot showing departure from the binomial distribution for WomenQueue
Fit1= goodfit(WomenQueue, type = "binomial", par = list(size = 10))
plot(Fit1,type = "hanging")
Suggest some reasons why the number of women in queues of length 10 might depart from a binomial distribution, Bin(n = 10, p = 1/2).
Fit2= goodfit(WomenQueue, type = "binomial", par = list(prob = .5,size = 10))
plot(Fit2,type = "hanging", shade = TRUE)
## Warning in max.c * pmax(pmin(interpolate(abs(res)), 1), 0): longer object
## length is not a multiple of shorter object length
## Warning in max.c * pmax(pmin(interpolate(abs(res)), 1), 0): longer object
## length is not a multiple of shorter object length
distplot(WomenQueue, type = c("binomial"), conf_level = 0.95)
## Warning in distplot(WomenQueue, type = c("binomial"), conf_level = 0.95):
## size was not given, taken as maximum count
There are 0 for n=10 which maybe the reason they are ignored from binomial distribution
Carry out the GOF test for this fixed binomial distribution. What is the ratio of χ2/df? What do you conclude?
data("Saxony",package="vcd")
GOF=goodfit(Saxony,type="binomial",par=list(size=12))
GOF
##
## Observed and fitted values for binomial distribution
## with parameters estimated by `ML'
##
## count observed fitted pearson residual
## 0 3 0.9328394 2.1402809
## 1 24 12.0888374 3.4257991
## 2 104 71.8031709 3.7996298
## 3 286 258.4751335 1.7120476
## 4 670 628.0550119 1.6737139
## 5 1033 1085.2107008 -1.5849023
## 6 1343 1367.2793552 -0.6566116
## 7 1112 1265.6303069 -4.3184059
## 8 829 854.2466464 -0.8637977
## 9 478 410.0125627 3.3576088
## 10 181 132.8357027 4.1789562
## 11 45 26.0824586 3.7041659
## 12 7 2.3472734 3.0368664
Test the additional lack of fit for the model 𝐵𝑖𝑛(𝑛 = 12, 𝑝 = 1) compared to themodel 𝐵𝑖𝑛(𝑛 = 12, 𝑝 = 𝑝̂) where 𝑝̂ is estimated from the data.
GOF2=goodfit(Saxony,type="binomial",par=list(prob=.5,size=12))
GOF2
##
## Observed and fitted values for binomial distribution
## with fixed parameters
##
## count observed fitted pearson residual
## 0 3 1.49292 1.2334401
## 1 24 17.91504 1.4376359
## 2 104 98.53271 0.5507842
## 3 286 328.44238 -2.3419098
## 4 670 738.99536 -2.5380434
## 5 1033 1182.39258 -4.3445838
## 6 1343 1379.45801 -0.9816094
## 7 1112 1182.39258 -2.0471328
## 8 829 738.99536 3.3108845
## 9 478 328.44238 8.2523747
## 10 181 98.53271 8.3079041
## 11 45 17.91504 6.3991064
## 12 7 1.49292 4.5071617
Use the plot(goodfit ()) method to visualize these two models.
plot(GOF,xlab="Ocurrences",ylab="Frequency",main="Families in Saxony: GOF (size=12)")
plot(GOF2,xlab="Ocurrences",ylab="Frequency",main="Families in Saxony: GOF (size=12,p=.5)")
Goodness of fit with P=.5 shows a very slightly higher frequency for the first half of ocurrences and a lower frequency for the second half.