getwd()
## [1] "/Users/nishitatamuly/Desktop/HU/ANLY 545 - R/R"
setwd("/Users/nishitatamuly/Desktop/HU/ANLY 545 - R/R")
getwd()
## [1] "/Users/nishitatamuly/Desktop/HU/ANLY 545 - R/R"
##install.packages("HistData")
library(HistData)
library(magrittr)
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(vcd)
## Loading required package: grid

Make a plot of Ratio over Year. What features stand out? Which plot do you prefer to display the tendency for more male births?

maxRatio=max(Arbuthnot$Ratio)
minRatio=min(Arbuthnot$Ratio)
meanRatio=mean(Arbuthnot$Ratio)

plot_ly(Arbuthnot, x = ~Year, y = ~Ratio, name = 'Ratio: Male vs. Female',type = 'scatter', mode = 'lines',line = list(color = 'purple', width = 1.5))%>%
  add_trace(y = ~maxRatio, name = 'Maximum', mode = 'lines',line = list(color = 'green', width = 2)) %>%
add_trace(y = ~minRatio, name = 'Minimum', mode = 'lines',line = list(color = 'red', width = 2)) %>%
add_trace(y = ~meanRatio, name = 'Mean', mode = 'lines',line = list(color = 'black', width = 2))

We see from the plot the ratio of Male:Femal births is above the mean in most cases displaying that there are more male births on average.

Plot the total number of christenings, Males + Females or Total (in 000s) over time. What unusual features do you see?

maximum=max(Arbuthnot$Total) 
minimum=min(Arbuthnot$Total)

plot_ly(Arbuthnot, x = ~Year, y = ~Total, name = 'Total Number of Christenings (in 000s))', type = 'scatter', mode = 'line',line = list(color = 'blue', width = 1.5)) %>%
add_trace(y = ~maximum, name = 'Max Christening', line = list(color = 'green', width = 2)) %>%
add_trace(y = ~minimum, name = 'Min Christening', line = list(color = 'red', width = 2)) %>%
layout(xaxis = list(title = "Year"), 
       yaxis = list (title = "Total Number of Christenings (in 000s)"))
## A line object has been specified, but lines is not in the mode
## Adding lines to the mode...
## A line object has been specified, but lines is not in the mode
## Adding lines to the mode...
## A line object has been specified, but lines is not in the mode
## Adding lines to the mode...

Although christenings increase over time, there is a huge / drastic drop in christenings in the 1640s as well as in 1704. We know there was a drop in births but could there be a historical or cultutral reason in that period for no christenings?

Make a reasonable plot showing departure from the binomial distribution for WomenQueue

Fit1= goodfit(WomenQueue, type = "binomial", par = list(size = 10))
plot(Fit1,type = "hanging")

Suggest some reasons why the number of women in queues of length 10 might depart from a binomial distribution, Bin(n = 10, p = 1/2).

Fit2= goodfit(WomenQueue, type = "binomial", par = list(prob = .5,size = 10))
plot(Fit2,type = "hanging", shade = TRUE)
## Warning in max.c * pmax(pmin(interpolate(abs(res)), 1), 0): longer object
## length is not a multiple of shorter object length

## Warning in max.c * pmax(pmin(interpolate(abs(res)), 1), 0): longer object
## length is not a multiple of shorter object length

distplot(WomenQueue, type = c("binomial"), conf_level = 0.95)
## Warning in distplot(WomenQueue, type = c("binomial"), conf_level = 0.95):
## size was not given, taken as maximum count

There are 0 for n=10 which maybe the reason they are ignored from binomial distribution

Carry out the GOF test for this fixed binomial distribution. What is the ratio of χ2/df? What do you conclude?

data("Saxony",package="vcd")
GOF=goodfit(Saxony,type="binomial",par=list(size=12))
GOF
## 
## Observed and fitted values for binomial distribution
## with parameters estimated by `ML' 
## 
##  count observed       fitted pearson residual
##      0        3    0.9328394        2.1402809
##      1       24   12.0888374        3.4257991
##      2      104   71.8031709        3.7996298
##      3      286  258.4751335        1.7120476
##      4      670  628.0550119        1.6737139
##      5     1033 1085.2107008       -1.5849023
##      6     1343 1367.2793552       -0.6566116
##      7     1112 1265.6303069       -4.3184059
##      8      829  854.2466464       -0.8637977
##      9      478  410.0125627        3.3576088
##     10      181  132.8357027        4.1789562
##     11       45   26.0824586        3.7041659
##     12        7    2.3472734        3.0368664

Test the additional lack of fit for the model 𝐵𝑖𝑛(𝑛 = 12, 𝑝 = 1) compared to themodel 𝐵𝑖𝑛(𝑛 = 12, 𝑝 = 𝑝̂) where 𝑝̂ is estimated from the data.

GOF2=goodfit(Saxony,type="binomial",par=list(prob=.5,size=12))
GOF2
## 
## Observed and fitted values for binomial distribution
## with fixed parameters 
## 
##  count observed     fitted pearson residual
##      0        3    1.49292        1.2334401
##      1       24   17.91504        1.4376359
##      2      104   98.53271        0.5507842
##      3      286  328.44238       -2.3419098
##      4      670  738.99536       -2.5380434
##      5     1033 1182.39258       -4.3445838
##      6     1343 1379.45801       -0.9816094
##      7     1112 1182.39258       -2.0471328
##      8      829  738.99536        3.3108845
##      9      478  328.44238        8.2523747
##     10      181   98.53271        8.3079041
##     11       45   17.91504        6.3991064
##     12        7    1.49292        4.5071617

Use the plot(goodfit ()) method to visualize these two models.

plot(GOF,xlab="Ocurrences",ylab="Frequency",main="Families in Saxony: GOF (size=12)")

plot(GOF2,xlab="Ocurrences",ylab="Frequency",main="Families in Saxony: GOF (size=12,p=.5)")

Goodness of fit with P=.5 shows a very slightly higher frequency for the first half of ocurrences and a lower frequency for the second half.