Theme Song
# install.packages("remotes")
require('BBmisc')
#remotes::install_github("rstudio/sass")
lib('sass')
## sass
## TRUE
/* https://stackoverflow.com/a/66029010/3806250 */
h1 { color: #002C54; }
h2 { color: #2F496E; }
h3 { color: #375E97; }
/* ----------------------------------------------------------------- */
/* https://gist.github.com/himynameisdave/c7a7ed14500d29e58149#file-broken-gradient-animation-less */
.hover01 {
/* color: #FFD64D; */
background: linear-gradient(155deg, #EDAE01 0%, #FFEB94 100%);
transition: all 0.45s;
&:hover{
background: linear-gradient(155deg, #EDAE01 20%, #FFEB94 80%);
}
}
.hover02 {
color: #FFD64D;
background: linear-gradient(155deg, #002C54 0%, #4CB5F5 100%);
transition: all 0.45s;
&:hover{
background: linear-gradient(155deg, #002C54 20%, #4CB5F5 80%);
}
}
.hover03 {
color: #FFD64D;
background: linear-gradient(155deg, #A10115 0%, #FF3C5C 100%);
transition: all 0.45s;
&:hover{
background: linear-gradient(155deg, #A10115 20%, #FF3C5C 80%);
}
}
## https://stackoverflow.com/a/36846793/3806250
options(width = 999)
knitr::opts_chunk$set(class.source = 'hover01', class.output = 'hover02', class.error = 'hover03')
if(!suppressPackageStartupMessages(require('BBmisc'))) {
install.packages('BBmisc', dependencies = TRUE, INSTALL_opts = '--no-lock')
}
suppressPackageStartupMessages(require('BBmisc'))
# suppressPackageStartupMessages(require('rmsfuns'))
pkgs <- c('devtools', 'knitr', 'kableExtra', 'tidyr',
'readr', 'lubridate', 'data.table', 'reprex',
'timetk', 'plyr', 'dplyr', 'stringr', 'magrittr',
'tdplyr', 'tidyverse', 'formattable',
'echarts4r', 'paletteer')
suppressAll(lib(pkgs))
# load_pkg(pkgs)
## Set the timezone but not change the datetime
Sys.setenv(TZ = 'Asia/Tokyo')
## options(knitr.table.format = 'html') will set all kableExtra tables to be 'html', otherwise need to set the parameter on every single table.
options(warn = -1, knitr.table.format = 'html')#, digits.secs = 6)
## https://stackoverflow.com/questions/39417003/long-vectors-not-supported-yet-abnor-in-rmd-but-not-in-r-script
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
#,
#cache = TRUE, cache.lazy = FALSE)
rm(pkgs)
Consider the code provided in the lesson “Sample code for simulating from a Mixture Model”:
# Sample code for simulating from a Mixture Model
# Generate n observations from a mixture of two Gaussian
# distributions
n = 50 # Size of the sample to be generated
w = c(0.6, 0.4) # Weights
mu = c(0, 5) # Means
sigma = c(1, 2) # Standard deviations
cc = sample(1:2, n, replace=T, prob=w)
x = rnorm(n, mu[cc], sigma[cc])
# Plot f(x) along with the observations
# just sampled
xx = seq(-5, 12, length=200)
yy = w[1]*dnorm(xx, mu[1], sigma[1]) +
w[2]*dnorm(xx, mu[2], sigma[2])
par(mar=c(4,4,1,1)+0.1)
plot(xx, yy, type="l", ylab="Density", xlab="x", las=1, lwd=2)
points(x, y=rep(0,n), pch=1)
Source : Coursera-Bayesian-Statistics-Mixture-Models/Week1 Sample Code for Sim Mix Model.R
Modify the code above to sample 200 random numbers from a mixture of 3 Poisson distributions with means 1, 2 and 6 and weights 0.7, 0.2 and 0.1, respectively, and generate a barplot with the empirical frequencies of all the integers included in the sample.
The response should follow the same template as the sample code provided above. Peer reviewers will be asked to check whether the different pieces of code have been adequately modified to reflect the fact that (1) you are working now with 3 components rather than 2, (2) that the components of the mixture are Poisson distributions rather than Gaussians, (3) that the empirical frequencies associated with the sample generated are correctly computed, (4) that a barplot is used to represent the empirical frequencies.
n = 200 # Size of the sample to be generated
w = c(0.7, 0.2, 0.1) # Weights
lamba = c(1, 2, 6) # Means
cc = sample(1:3, n, replace=TRUE, prob=w)
x = rpois(n, lamba[cc])
empfreq = table(factor(x, levels=seq(0, max(x))))/n
#barplot(empfreq)
e_common(
font_family = "Raleway",
theme = NULL
)
tibble(no = 1:nrow(empfreq), value = empfreq) |>
e_charts(no) |>
e_theme('dark-mushroom') |>
e_bar(value, stack = 'grp') |>
e_line(value, stack = 'grp2') |>
e_title('Timeline', 'Plot', sublink = 'https://echarts4r.john-coene.com/reference/e_bar.html')
Have the parameters of the mixture (both weights and component-specific parameters) been correctly specified?
The lines
n = 50
w = c(0.6, 0.4) # Weights
mu = c(0, 5) # Means
sigma = c(1, 2) # Standard deviations
in the original code should have been replaced by something like
n = 200
w = c(0.7, 0.2, 0.1) # Weights
lambda = c(1, 2, 6) # Means
Is the sampling of the indicator variables done correctly?
The line
cc = sample(1:2, n, replace=T, prob=w)
## Error in sample.int(length(x), size, replace, prob): incorrect number of probabilities
cc
## [1] 1 1 1 1 1 2 1 1 1 3 1 3 1 1 2 1 3 1 1 2 1 1 1 1 1 1 3 1 2 3 1 2 1 1 1 1 3 1 2 1 1 1 1 3 1 1 3 2 1 1 2 1 1 1 3 1 3 1 1 1 2 2 1 1 3 2 3 2 1 2 2 2 1 3 1 1 1 1 2 1 1 2 1 1 1 2 1 2 1 1 3 1 1 1 1 1 2 1 1 1 1 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 3 1 2 1 1 2 1 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 2 2 1 1 1 1 1 2 1 2 1 1 1 1 1 1 2 1 1 3 3 2 1 1 1 1 1 1 1 1 1 1 2 1 1 3 2 2 1 1 1 2
in the original code should now be something like
cc = sample(1:3, n, replace=T, prob=w)
cc
## [1] 1 2 1 1 1 2 2 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 2 1 2 1 3 1 2 2 2 2 1 1 1 2 1 1 2 1 1 1 1 1 1 2 2 1 2 1 1 3 1 2 2 1 3 1 2 1 1 2 1 1 2 2 1 1 1 2 1 1 1 1 1 2 2 2 1 1 2 1 1 1 1 1 1 1 1 2 2 1 3 1 1 1 1 1 1 1 1 1 1 1 2 1 1 3 1 1 3 1 1 1 2 1 1 1 2 1 1 1 2 1 1 2 1 1 1 1 2 1 1 2 3 3 1 1 1 3 1 1 1 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 2 1 1 1 1 1 2 3 1 1 1 1 1 3 1 3 2 1 1 1 1 1 1 1 1 1 1 2 3 1 2 1 1 1 1 1 2 1 2
to reflect the fact that we now have 3 components in the mixture.
Assuming that the indicator variables have been correctly sampled, are the observations being sampled correctly?
The line
x = rnorm(n, mu[cc], sigma[cc])
x
## [1] 0.212872872 5.921317560 0.875968855 -0.515876648 -0.162261507 8.250833869 7.524761742 -0.848905082 0.326508711 -3.339498394 -0.386554579 3.140937174 -0.260173822 0.636804314 0.972256069 0.495329621 -1.699295125 3.704314409 0.027854992 0.439442362 -0.626504712 -0.786550630 -0.274770177 0.659614199 6.887070722 0.062848292 3.515230561 0.175651221 NaN 1.771097466 6.257130770 3.496655408 5.773869467 4.533442290 -0.723204657 0.482203584 -0.818526993 5.460387289 -0.225837171 -0.313664797 3.208343573 -1.311446115 -0.832691989 -0.052123823 -0.396055804 0.358422653 1.819256535 6.999404072 5.278373783 -0.271475575 4.518071344 -1.092237020 0.975893816 NaN 0.239038653 5.665186969 4.248484390 -0.454585874 NaN -0.707138970 6.312059767 0.260199670 2.375909640 0.947824223 -0.093198058 0.077352746 3.681068681 3.252274781 0.361114682 -0.959956355 -0.474830362 4.769732198 1.887893736 1.742973671 -0.190696789 0.017186934
## [77] 1.504653470 2.876315233 5.198555164 7.930265009 -0.519741590 -1.621651031 7.687684224 -0.199643525 0.786132748 -0.856765987 0.005721487 -0.965228161 0.445303869 -0.577097209 -0.938589264 7.705255537 8.200668759 -0.461294395 NaN 1.035659381 0.030832176 -0.670459502 0.301847644 1.329559883 0.251472191 -0.401104470 0.405761005 -1.107333993 -0.717942727 -0.422187248 5.081967162 -1.634156331 -0.391269141 NaN -0.599073914 0.464770080 NaN 0.450614621 -0.186750899 -0.001176947 1.331036356 0.161883601 0.373558445 -1.635928348 4.274374221 0.098839208 1.066055572 -0.168488487 4.504562741 1.026778764 -1.832810083 4.840103949 -0.720912132 -2.505837092 -1.160364202 1.909083133 5.680910376 1.119421580 0.162077291 4.122792845 NaN NaN 0.129493815 -0.968581775 0.628197574 NaN -0.680969366 -0.102705294 -0.539013999 -0.259712106 1.469267187 0.424869230 -0.947855076 1.474101570 3.610866390 0.120552852
## [153] -0.134067627 1.475719097 -0.781645428 -1.600403010 5.942392036 -0.814667606 -0.099417339 0.145083408 -1.513511110 6.336085025 -0.252212572 0.839267694 0.142092317 -0.660867871 -2.792261133 2.551432989 NaN 0.723469684 0.072188707 1.319216014 -0.478923162 -0.514121786 NaN 0.789791425 NaN 4.241280618 -0.617515787 -0.130167954 -0.141889204 -0.932697127 0.134553432 1.027818580 0.849807157 0.610423935 -0.813874168 1.260186656 5.045383709 NaN 1.000206050 1.647110179 0.550144350 1.649309954 -0.702852393 0.232517932 -0.609529270 2.143966169 0.653449247 3.428412610
should now be something like
x = rpois(n, lambda[cc])
x
## [1] 0 2 2 1 1 2 3 0 0 0 2 1 0 5 0 1 3 2 0 1 2 0 1 3 1 0 0 1 3 1 2 4 3 0 1 0 0 3 0 0 1 2 2 0 1 0 1 2 2 2 1 2 0 8 1 1 3 1 4 0 1 0 0 1 0 0 2 3 3 1 0 2 1 0 1 0 1 1 0 1 1 1 3 0 0 1 2 3 0 1 2 0 2 0 8 1 2 1 1 0 0 2 0 1 1 1 0 1 2 6 0 0 8 0 0 0 2 0 1 1 2 2 2 1 2 0 0 1 1 2 0 0 3 1 1 0 4 4 1 0 2 10 1 0 1 0 1 1 0 0 1 1 0 1 0 0 3 1 0 2 0 2 1 0 0 1 1 2 8 0 3 3 1 0 9 1 6 1 1 2 2 2 0 0 1 0 0 1 2 1 0 3 1 1 0 1 1 1 1 2
to reflect the fact that the components of the mixture are all Poisson distributions.
Are the empirical frequencies computed correctly?
The code used to compute the frequencies should look something like
empfreq = table(factor(x, levels=seq(0, max(x))))/n
empfreq
##
## 0 1 2 3 4 5 6 7 8 9 10
## 0.335 0.340 0.180 0.080 0.020 0.005 0.010 0.000 0.020 0.005 0.005
The main difficulty here is how to properly compute the frequencies using R, rather than anything that is specific to mixture models. Using the function table()
directly on the vector \(x\) will ignore values with zero counts. Similarly, the function tabulate()
will ignore the \(x = 0\) case. First converting the vector \(x\) into a factor while ensuring that any integer between 0 and the maximum in the sample are valid factors avoids the issue.
empfreq = table(x)/n
empfreq
## x
## 0 1 2 3 4 5 6 8 9 10
## 0.335 0.340 0.180 0.080 0.020 0.005 0.010 0.020 0.005 0.005
Partial credit should also be given if counts rather than frequencies are reported (i.e., if the answer does not divide by \(n\)).
Does the barplot appear correct?
The code should be something like
barplot(empfreq)
or
hist(x, right=F)
When using the hist function, the option right=F must be specified so that the intervals/bins are specified correctly. This ensures that the intervals are [0,1), [1,2),…,[10,11]. Note that the last interval is a closed one. In the default setting (right=T), the first category [0,1] is closed. That is, the left and right endpoints are included.
Enter code to sample 200 random numbers from a mixture of 3 Poisson distributions with means 1, 2 and 6 and weights 0.7, 0.2 and 0.1, respectively.
n = 200 # Size of the sample to be generated
w = c(0.7, 0.2, 0.1) # Weights
lamba = c(1, 2, 6) # Means
cc = sample(1:3, n, replace=T, prob=w)
x = rpois(n, lamba[cc])
Enter code to generate a barplot with the empirical frequencies of all the integers included in the sample.
empfreq = table(factor(x, levels=seq(0, max(x))))/n
barplot(empfreq)
Have the parameters of the mixture (both weights and component-specific parameters) been correctly specified?
The lines
n = 50
w = c(0.6, 0.4) # Weights
mu = c(0, 5) # Means
sigma = c(1, 2) # Standard deviations
in the original code should have been replaced by something like
n = 200
w = c(0.7, 0.2, 0.1) # Weights
lambda = c(1, 2, 6) # Means
Is the sampling of the indicator variables done correctly?
The line
cc = sample(1:2, n, replace=T, prob=w)
## Error in sample.int(length(x), size, replace, prob): incorrect number of probabilities
cc
## [1] 1 1 1 1 3 1 1 3 2 2 1 1 3 1 2 1 1 1 1 1 1 1 1 3 1 1 1 1 2 2 3 1 1 1 3 1 2 1 1 2 1 2 2 2 1 1 1 2 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 3 1 1 1 2 1 1 1 2 3 1 2 1 1 1 1 1 1 3 1 3 1 1 1 1 1 3 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 3 2 1 1 1 2 1 1 2 1 1 1 3 1 1 1 2 2 1 1 1 1 1 1 1 3 1 2 1 2 3 1 1 1 1 1 1 1 1 1 1 3 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 2 1 3 1 1 1 2 1 1 1 1 1 2 3 2 1 1 3 2 3 2 1
in the original code should now be something like
cc = sample(1:3, n, replace=T, prob=w)
cc
## [1] 1 2 1 2 1 1 1 1 1 2 1 1 2 3 1 1 1 1 1 1 2 2 3 1 2 2 1 1 2 1 1 1 2 1 1 1 1 3 1 1 1 1 1 1 1 2 1 1 3 1 3 1 1 1 2 1 2 1 1 1 3 1 1 2 2 2 1 1 1 1 2 1 2 1 1 1 1 1 2 2 1 1 1 3 1 1 1 1 1 3 1 2 2 2 1 3 2 1 1 1 1 1 1 3 1 2 1 1 3 2 2 2 1 2 1 1 2 1 1 1 1 2 1 1 2 1 3 1 1 1 1 1 2 1 2 1 1 1 1 2 1 1 3 2 1 1 1 1 2 1 2 2 3 1 1 1 1 1 1 1 1 2 1 2 1 1 2 2 1 1 1 1 1 1 2 2 2 1 1 1 3 1 1 3 1 1 1 1 1 1 2 2 2 1 2 1 2 3 1 1
to reflect the fact that we now have 3 components in the mixture.
Assuming that the indicator variables have been correctly sampled, are the observations being sampled correctly?
The line
x = rnorm(n, mu[cc], sigma[cc])
x
## [1] 1.235743165 7.344719143 1.408751229 7.420378759 1.165751316 0.732260537 0.057361282 0.465841950 0.085082339 1.763988712 -1.467358240 -1.359339287 9.298978295 NaN 0.554890679 -1.112496312 -0.870171108 -1.070596429 0.210014363 0.640968852 5.614533247 7.393165599 NaN 1.044539487 2.408227959 4.321065919 0.180577433 -0.298534589 5.581980501 1.030511132 -0.423474826 0.949315796 3.395425515 0.213908998 1.226942817 1.402285997 0.092693580 NaN 0.045327229 1.732372378 0.221968015 0.639753147 0.426437455 0.954636871 0.111420749 3.400235816 -1.825091379 0.954143379 NaN -0.395080673 NaN 0.916707931 -0.092657480 -0.240094210 5.988156751 -0.891652828 3.809171997 0.804084268 0.768375809 -0.897195732 NaN -0.030040465 0.529797443 4.858709902 5.990224441 4.726760888 -0.753837237 0.383488778 -0.571802901 1.125199924 3.468771471 0.666883256 4.437234680 -0.580328238 2.338695997 -0.304344502
## [77] 1.673671246 1.451389881 -0.510990103 7.893448873 -1.052111719 -0.827423049 2.497490129 NaN 0.374345928 1.021177923 -0.418952699 -0.862441707 1.961173877 NaN -1.918822745 4.759465522 1.683813088 2.504631973 1.353403849 NaN 5.328241541 -0.097550607 0.354735471 -0.202175083 1.463228331 0.543096905 1.224399361 NaN 0.128796215 6.197565919 1.732282434 -0.564747572 NaN 0.621234120 10.907486146 5.516209921 0.119846604 1.960619273 1.379610505 -0.381231110 5.863775974 0.873815335 0.500165529 0.677588816 0.447122370 6.774697460 -1.069833352 -0.169445155 6.103556537 -0.978123417 NaN -0.503056398 -1.552051937 0.224154334 -0.456912044 -0.251731527 2.318710964 -0.560714936 3.924347288 0.993864418 1.022178891 -1.255358958 -1.300791769 6.292824688 -0.372718793 -2.521348586 NaN 5.886138799 -0.337651415 -0.342132099 -0.700286190 -2.131314185 5.890650926 2.024670925 5.246738037 2.425654948
## [153] NaN -0.262779298 0.360569112 -0.356880514 -0.132212779 -1.068194466 0.931097845 0.063095835 -0.817426757 3.902853058 -0.552514467 4.320529856 -0.510029625 0.337884100 3.340511021 6.778848874 0.604245192 -1.310844641 -1.725373774 1.681115525 -0.814487821 -1.239038873 4.841883094 2.427211516 3.846529905 -1.493648234 0.081664558 -0.677390192 NaN 0.276189403 0.611143657 NaN 1.399380849 -1.175691205 0.730986390 2.266213947 1.025172903 0.024644246 4.171443624 3.046457539 6.333831390 0.009665193 5.024237712 0.695698887 6.050112594 NaN -0.827851587 -0.149902879
should now be something like
x = rpois(n, lambda[cc])
x
## [1] 0 1 1 2 0 1 0 0 0 3 0 1 1 7 2 3 2 0 1 3 2 2 2 1 2 2 1 5 3 0 1 0 1 1 0 1 1 4 1 1 2 2 1 1 3 2 1 2 7 1 9 1 1 1 3 1 2 0 0 0 5 0 0 1 3 1 1 0 2 1 0 0 2 0 2 0 0 2 2 3 1 2 2 4 2 1 0 2 1 4 1 1 2 0 3 7 0 1 2 3 2 0 1 6 0 3 3 2 7 2 2 0 2 1 0 3 1 1 3 0 1 4 1 2 0 1 7 2 1 0 2 2 2 0 0 2 1 0 1 2 1 0 3 3 0 1 1 1 1 0 2 4 4 1 2 0 0 0 0 0 2 2 0 1 1 0 2 5 1 0 2 1 1 1 1 2 0 1 2 0 5 1 3 2 0 1 4 1 2 1 2 0 2 1 2 0 1 8 1 1
to reflect the fact that the components of the mixture are all Poisson distributions.
Are the empirical frequencies computed correctly?
The code used to compute the frequencies should look something like
empfreq = table(factor(x, levels=seq(0, max(x))))/n
empfreq
##
## 0 1 2 3 4 5 6 7 8 9
## 0.250 0.325 0.245 0.085 0.035 0.020 0.005 0.025 0.005 0.005
The main difficulty here is how to properly compute the frequencies using R, rather than anything that is specific to mixture models. Using the function table()
directly on the vector \(x\) will ignore values with zero counts. Similarly, the function tabulate()
will ignore the \(x = 0\) case. First converting the vector \(x\) into a factor while ensuring that any integer between 0 and the maximum in the sample are valid factors avoids the issue.
empfreq = table(x)/n
empfreq
## x
## 0 1 2 3 4 5 6 7 8 9
## 0.250 0.325 0.245 0.085 0.035 0.020 0.005 0.025 0.005 0.005
Partial credit should also be given if counts rather than frequencies are reported (i.e., if the answer does not divide by \(n\)).
Does the barplot appear correct?
The code should be something like
barplot(empfreq)
or
hist(x, right=F)
When using the hist function, the option right=F must be specified so that the intervals/bins are specified correctly. This ensures that the intervals are [0,1), [1,2),…,[10,11]. Note that the last interval is a closed one. In the default setting (right=T), the first category [0,1] is closed. That is, the left and right endpoints are included.
Enter code to sample 200 random numbers from a mixture of 3 Poisson distributions with means 1, 2 and 6 and weights 0.7, 0.2 and 0.1, respectively.
set.seed(123)
## Generate n observations from a mixture of three Poisson distributions
n = 200 # Size of the sample to be generated
w = c(0.7, 0.2, 0.1) # Weights
lambda = c(1, 2, 6) # Means (also Variances) of 3 Poisson models
# Each of n Observations randomly selects one out of 3 Poisson models
cc = sample(1:3, n, replace=T, prob=w)
# Value of each observations depends on selected Poisson model
yy = rpois(n, lambda[cc])
Enter code to generate a barplot with the empirical frequencies of all the integers included in the sample.
# range of Values for X-axis
xx = c(1 : (max(yy) + 5))
# build Frequency list based on occurrences of Values for y-axis
zz = NULL
for (i in xx) {
zz <- append(zz, length(which(yy==i)))
}
# BarPlot of Frequency vs Values observed
par(mar=c(4,4,1,1)+0.1)
barplot(zz, names.arg=xx, width=1.0,
main="Values Generated by Mixture of 3 Poisson Models",
xlim=c(0, (max(yy) + 6)), ylim=c(0, max(zz)+0.2),
ylab="Frequency", xlab="Value")
Have the parameters of the mixture (both weights and component-specific parameters) been correctly specified?
The lines
n = 50
w = c(0.6, 0.4) # Weights
mu = c(0, 5) # Means
sigma = c(1, 2) # Standard deviations
in the original code should have been replaced by something like
n = 200
w = c(0.7, 0.2, 0.1) # Weights
lambda = c(1, 2, 6) # Means
Is the sampling of the indicator variables done correctly?
The line
cc = sample(1:2, n, replace=T, prob=w)
## Error in sample.int(length(x), size, replace, prob): incorrect number of probabilities
cc
## [1] 1 2 1 2 3 1 1 2 1 1 3 1 1 1 1 2 1 1 1 3 2 1 1 3 1 2 1 1 1 1 3 3 1 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 1 1 2 2 1 1 1 1 1 2 1 2 2 2 1 2 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 3 2 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 3 1 2 3 1 1 1 3 1 1 3 2 1 1 3 1 1 1 1 1 1 1 3 1 1 1 1 1 2 1 2 1 1 2 2 3 1 1 1 1 1 2 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 2 2 1 1 1 1 2 1 2 1 2 1 1 1 1 1 3 3 1 1 3 1 3 1 1 1 1 1
in the original code should now be something like
cc = sample(1:3, n, replace=T, prob=w)
cc
## [1] 3 1 3 1 1 1 2 1 1 1 1 2 1 1 1 1 3 1 1 1 1 1 1 1 2 1 1 1 1 2 3 1 1 3 1 1 1 1 1 1 1 1 2 1 2 2 2 1 1 1 1 1 1 1 1 2 2 2 1 1 3 1 1 1 1 1 1 1 1 2 1 2 1 2 1 1 1 1 1 2 1 2 1 1 3 1 1 1 1 2 3 1 1 2 1 3 1 1 1 2 1 1 1 1 1 1 1 1 3 1 1 1 2 1 2 1 1 2 1 2 1 1 1 1 1 1 3 1 3 1 3 1 1 2 1 2 1 2 1 1 2 1 2 1 2 3 1 2 2 1 1 1 1 2 1 1 1 1 1 1 1 3 2 1 1 1 1 2 1 2 1 3 1 1 3 2 1 1 1 1 2 2 2 1 2 1 1 1 3 1 1 1 3 1 1 2 1 1 2 1
to reflect the fact that we now have 3 components in the mixture.
Assuming that the indicator variables have been correctly sampled, are the observations being sampled correctly?
The line
x = rnorm(n, mu[cc], sigma[cc])
x
## [1] NaN -0.71524219 NaN -0.75268897 -0.93853870 -1.05251328 4.12568093 0.33117917 -2.01421050 0.21198043 1.23667505 9.07514804 1.30117599 0.75677476 -1.72673040 -0.60150671 NaN -0.35204646 0.70352390 -0.10567133 -1.25864863 1.68443571 0.91139129 0.23743027 7.43621722 -1.33877429 0.66082030 -0.52291238 0.68374552 4.87835609 NaN 0.63296071 1.33551762 NaN 0.00729009 1.01755864 -1.18843404 -0.72160444 1.51921771 0.37738797 -2.05222282 -1.36403745 4.59843797 0.86577940 4.79623349 6.24837494 6.91801076 1.67105483 0.05601673 -0.05198191 -1.75323736 0.09932759 -0.57185006 -0.97400958 -0.17990623 7.02988635 1.01450302 4.14544143 0.11663728 -0.89320757 NaN 0.33390294 0.41142992 -0.03303616 -2.46589819 2.57145815 -0.20529926 0.65119328 0.27376649 7.04934647 0.81765945 4.58041366 0.37816777 3.10918234 0.85692301 -0.46103834 2.41677335 -1.65104890 -0.46398724 6.65075973 0.51013255 3.82103792
## [83] -0.99678074 0.14447570 NaN -0.01430741 -1.79028124 0.03455107 0.19023032 5.34945279 NaN -1.05501704 0.47613328 7.75714027 0.45623640 NaN -1.13558847 -0.43564547 0.34610362 3.70590874 -2.15764634 0.88425082 -0.82947761 -0.57356027 1.50390061 -0.77414493 0.84573154 -1.26068288 NaN -0.35454240 -0.07355602 -1.16865142 3.73050347 -0.02884155 6.34139194 -1.65054654 -0.34975424 6.51281288 -0.53880916 5.45458384 0.49222857 0.26783502 0.65325768 -0.12270866 -0.41367651 -2.64314895 NaN -0.09294102 NaN 0.43028470 NaN 0.53539884 -0.55527835 8.55900582 0.28642442 5.25263172 1.27226678 3.56306756 -0.45033862 2.39745248 5.02225837 1.63356842 2.12298671 -0.19051680 5.75684781 NaN 0.30003855 2.98872748 5.03851855 -1.07742065 0.71270333 1.08477509 -2.22498770 7.47138692 -1.24104450 0.45476927 0.65990264 -0.19988983 -0.64511396 0.16532102 0.43881870 NaN 6.76660564 -2.05233698
## [165] -1.63637927 1.43040234 1.04662885 5.87057790 0.71517841 6.83434984 -2.66092280 NaN 1.11027710 -0.48498760 NaN 5.46123366 -0.29515780 0.87196495 -0.34847245 0.51850377 4.21863004 2.81442558 7.42002102 0.74090001 8.44852448 0.06515393 1.12500275 1.97541905 NaN -0.28148212 -1.32295111 -0.23935157 NaN -0.21404124 0.15168050 8.42460995 -0.32614389 0.37300466 4.54463187 0.02045071
should now be something like
x = rpois(n, lambda[cc])
x
## [1] 7 0 9 1 1 0 3 1 2 0 2 1 0 1 3 0 5 1 1 1 3 1 2 2 1 0 0 1 0 1 6 1 1 8 1 1 1 0 0 1 0 2 3 1 3 1 0 1 1 0 0 1 0 2 1 2 2 1 1 2 6 1 2 0 1 1 0 1 0 2 1 0 1 2 0 1 1 1 2 3 1 2 0 0 5 0 2 4 1 3 9 1 1 1 1 4 0 1 1 1 0 1 0 1 1 0 3 1 8 2 2 2 2 1 1 0 1 1 1 1 0 1 2 0 0 0 8 1 8 0 3 0 2 3 4 2 0 1 2 1 1 2 4 1 2 10 3 2 0 2 1 1 0 1 1 3 0 1 0 0 1 5 1 0 3 1 0 2 1 3 1 6 1 2 5 1 1 0 2 1 0 5 1 1 2 1 0 0 9 1 2 2 6 1 0 3 1 0 1 0
to reflect the fact that the components of the mixture are all Poisson distributions.
Are the empirical frequencies computed correctly?
The code used to compute the frequencies should look something like
empfreq = table(factor(x, levels=seq(0, max(x))))/n
empfreq
##
## 0 1 2 3 4 5 6 7 8 9 10
## 0.245 0.405 0.165 0.075 0.020 0.025 0.020 0.005 0.020 0.015 0.005
The main difficulty here is how to properly compute the frequencies using R, rather than anything that is specific to mixture models. Using the function table()
directly on the vector \(x\) will ignore values with zero counts. Similarly, the function tabulate()
will ignore the \(x = 0\) case. First converting the vector \(x\) into a factor while ensuring that any integer between 0 and the maximum in the sample are valid factors avoids the issue.
empfreq = table(x)/n
empfreq
## x
## 0 1 2 3 4 5 6 7 8 9 10
## 0.245 0.405 0.165 0.075 0.020 0.025 0.020 0.005 0.020 0.015 0.005
Partial credit should also be given if counts rather than frequencies are reported (i.e., if the answer does not divide by \(n\)).
Does the barplot appear correct?
The code should be something like
barplot(empfreq)
or
hist(x, right=F)
When using the hist function, the option right=F must be specified so that the intervals/bins are specified correctly. This ensures that the intervals are [0,1), [1,2),…,[10,11]. Note that the last interval is a closed one. In the default setting (right=T), the first category [0,1] is closed. That is, the left and right endpoints are included.
Enter code to sample 200 random numbers from a mixture of 3 Poisson distributions with means 1, 2 and 6 and weights 0.7, 0.2 and 0.1, respectively.
# Generate n observations from a mixture of three Poison
# distributions
n = 200 # Size of the sample to be generated
w = c(0.7, 0.2, 0.1) # Weights
mu = c(1, 2, 6) # Means
xx = seq(0, 17, by =1)
yy = w[1]*dpois(xx, mu[1]) + w[2]*dpois(xx, mu[2]) + w[3]*dpois(xx, mu[3])
Enter code to generate a barplot with the empirical frequencies of all the integers included in the sample.
# Plot f(x) along with the observations
# just sampled
xx = seq(0, 17, by = 1)
xx = seq(0, 17, by =1)
yy = w[1]*dpois(xx, mu[1]) + w[2]*dpois(xx, mu[2]) + w[3]*dpois(xx, mu[3])
par(mar=c(4,4,1,1)+0.1)
plot(xx, yy, type="l", ylab="Density", xlab="x", las=1, lwd=2)
Have the parameters of the mixture (both weights and component-specific parameters) been correctly specified?
The lines
n = 50
w = c(0.6, 0.4) # Weights
mu = c(0, 5) # Means
sigma = c(1, 2) # Standard deviations
in the original code should have been replaced by something like
n = 200
w = c(0.7, 0.2, 0.1) # Weights
lambda = c(1, 2, 6) # Means
Is the sampling of the indicator variables done correctly?
The line
cc = sample(1:2, n, replace=T, prob=w)
## Error in sample.int(length(x), size, replace, prob): incorrect number of probabilities
cc
## [1] 3 1 3 1 1 1 2 1 1 1 1 2 1 1 1 1 3 1 1 1 1 1 1 1 2 1 1 1 1 2 3 1 1 3 1 1 1 1 1 1 1 1 2 1 2 2 2 1 1 1 1 1 1 1 1 2 2 2 1 1 3 1 1 1 1 1 1 1 1 2 1 2 1 2 1 1 1 1 1 2 1 2 1 1 3 1 1 1 1 2 3 1 1 2 1 3 1 1 1 2 1 1 1 1 1 1 1 1 3 1 1 1 2 1 2 1 1 2 1 2 1 1 1 1 1 1 3 1 3 1 3 1 1 2 1 2 1 2 1 1 2 1 2 1 2 3 1 2 2 1 1 1 1 2 1 1 1 1 1 1 1 3 2 1 1 1 1 2 1 2 1 3 1 1 3 2 1 1 1 1 2 2 2 1 2 1 1 1 3 1 1 1 3 1 1 2 1 1 2 1
in the original code should now be something like
cc = sample(1:3, n, replace=T, prob=w)
cc
## [1] 1 1 1 1 1 1 1 3 1 1 1 3 1 1 2 1 1 1 1 1 1 2 2 1 1 2 1 1 1 1 1 2 1 1 2 1 3 1 2 2 1 2 1 3 1 1 2 1 1 3 1 1 1 2 1 2 1 3 1 1 1 2 3 1 1 1 2 1 1 1 1 1 1 1 1 2 3 1 1 1 1 1 2 1 1 1 1 2 2 1 1 1 1 2 1 1 3 1 1 1 1 1 2 1 1 3 1 3 1 2 1 1 2 1 2 1 1 1 1 2 1 2 1 1 2 1 1 1 2 1 1 2 1 1 2 2 2 1 1 1 1 2 2 1 1 1 1 1 1 1 2 1 2 1 2 1 1 1 2 1 3 1 3 1 1 1 1 1 1 1 1 1 1 1 2 2 2 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1
to reflect the fact that we now have 3 components in the mixture.
Assuming that the indicator variables have been correctly sampled, are the observations being sampled correctly?
The line
x = rnorm(n, mu[cc], sigma[cc])
x
## [1] 0.982113003 -0.727383529 -0.996838982 -1.041688859 -0.414588726 -0.239029068 0.483617534 NaN -0.321324842 -2.078489269 -0.091434276 NaN 1.187186811 1.191601269 3.422073564 -1.547776544 2.458060492 -0.162421942 -0.097451250 0.420574189 -1.614039457 3.543561778 1.919115189 -0.693094614 0.118849433 2.270581084 0.589982679 0.289344029 -0.904215026 0.226324942 0.748081162 7.122190507 -0.212848279 -0.093636794 4.826571730 1.441461756 NaN 1.125071892 6.668803136 4.425318400 0.373241434 5.806580662 -1.041673294 NaN -1.728304515 0.641830028 1.941378938 0.001683688 0.250247821 NaN 0.563867390 0.189426238 -0.732853806 6.972731720 1.738633767 6.762357618 -1.943650901 NaN 1.399576185 -0.056055946 0.524914279 6.244066472 NaN -0.096686073 -0.075263198 1.019157069 6.423203845 0.990262246 2.382926695 0.664415864 0.207381157 -2.210633111 2.691714003 -0.482676822 2.374734715 5.749287136
## [77] NaN 1.538430199 -0.109710321 0.511470755 0.213957980 -0.186120699 4.759212351 1.012834336 -0.201458147 -2.037682494 -0.195889249 6.079581211 6.232911433 0.616567817 -1.692101521 0.368742058 0.967859210 7.553157363 -0.224961271 -0.321892586 NaN 1.487837832 -1.667928046 -0.436829977 0.457462079 -1.617773765 5.559255725 1.877864021 -0.004060653 NaN -0.278454025 NaN 0.474911714 4.441855658 0.813400374 0.904435464 5.005383322 -1.176692158 2.363558545 -0.592997366 0.797380501 -1.958205175 -1.886325159 3.692440350 0.394394848 3.172867904 0.886749037 0.333369970 4.658720764 0.818828137 0.388365163 -0.445935027 5.462229869 0.647513358 0.356283345 3.683979584 0.855202205 1.152936226 5.552549113 5.288209325 4.848749838 2.161415846 0.276315532 -0.158294029 -2.507917802 1.869436470 4.844653599 0.206294045 0.276872459 0.821506780 -0.194152409 1.214588794 -0.921516043 -1.208442720 2.542027645 0.742297024
## [153] 4.834160115 0.789817922 4.464587157 -0.591892102 -0.368352575 -1.852616824 2.660769479 -1.442034646 NaN 1.054322272 NaN -0.597330091 0.789459853 1.516490602 -0.191774809 0.283878906 -1.751067519 -0.818669778 0.056214849 0.299086903 -0.759398117 2.684858999 4.083219722 5.128487113 6.299583734 -0.026018635 -0.643567387 1.045305664 1.615545323 -0.029693971 0.562267345 NaN -0.097412499 1.016455218 -1.156167394 2.320860224 -0.603531248 -1.458849414 -0.350917827 0.146708484 1.623621208 0.911209677 0.142458432 2.221032952 -0.866037745 -0.163284932 2.553026113 -1.860227574
should now be something like
x = rpois(n, lambda[cc])
x
## [1] 2 1 0 2 3 2 0 11 1 0 0 1 2 2 4 1 3 2 2 1 3 2 4 1 1 4 1 0 0 1 1 6 2 0 3 2 7 1 1 2 0 2 0 4 1 1 1 0 1 7 0 0 3 1 1 3 0 2 1 2 0 1 7 3 1 1 1 2 0 1 1 0 3 1 2 2 4 1 2 1 1 1 2 1 1 0 4 2 0 2 1 2 0 4 1 2 9 1 3 0 0 0 2 0 3 8 0 6 1 3 1 0 2 0 3 0 2 2 1 0 1 2 4 0 2 3 1 1 0 2 1 1 1 2 2 2 1 2 0 1 0 4 2 0 2 0 2 0 0 0 0 2 1 1 1 1 1 4 1 1 11 1 5 2 0 1 1 0 0 2 1 1 1 1 2 2 0 1 1 1 1 0 0 7 2 0 2 1 4 1 1 1 1 0 1 5 1 1 0 0
to reflect the fact that the components of the mixture are all Poisson distributions.
Are the empirical frequencies computed correctly?
The code used to compute the frequencies should look something like
empfreq = table(factor(x, levels=seq(0, max(x))))/n
empfreq
##
## 0 1 2 3 4 5 6 7 8 9 10 11
## 0.240 0.360 0.220 0.065 0.055 0.010 0.010 0.020 0.005 0.005 0.000 0.010
The main difficulty here is how to properly compute the frequencies using R, rather than anything that is specific to mixture models. Using the function table()
directly on the vector \(x\) will ignore values with zero counts. Similarly, the function tabulate()
will ignore the \(x = 0\) case. First converting the vector \(x\) into a factor while ensuring that any integer between 0 and the maximum in the sample are valid factors avoids the issue.
empfreq = table(x)/n
empfreq
## x
## 0 1 2 3 4 5 6 7 8 9 11
## 0.240 0.360 0.220 0.065 0.055 0.010 0.010 0.020 0.005 0.005 0.010
Partial credit should also be given if counts rather than frequencies are reported (i.e., if the answer does not divide by \(n\)).
Does the barplot appear correct?
The code should be something like
barplot(empfreq)
or
hist(x, right=F)
When using the hist function, the option right=F must be specified so that the intervals/bins are specified correctly. This ensures that the intervals are [0,1), [1,2),…,[10,11]. Note that the last interval is a closed one. In the default setting (right=T), the first category [0,1] is closed. That is, the left and right endpoints are included.
It’s useful to record some information about how your file was created.
suppressMessages(require('dplyr', quietly = TRUE))
suppressMessages(require('magrittr', quietly = TRUE))
suppressMessages(require('formattable', quietly = TRUE))
suppressMessages(require('knitr', quietly = TRUE))
suppressMessages(require('kableExtra', quietly = TRUE))
sys1 <- devtools::session_info()$platform %>%
unlist %>% data.frame(Category = names(.), session_info = .)
rownames(sys1) <- NULL
sys2 <- data.frame(Sys.info()) %>%
dplyr::mutate(Category = rownames(.)) %>% .[2:1]
names(sys2)[2] <- c('Sys.info')
rownames(sys2) <- NULL
if (nrow(sys1) == 9 & nrow(sys2) == 8) {
sys2 %<>% rbind(., data.frame(
Category = 'Current time',
Sys.info = paste(as.character(lubridate::now('Asia/Tokyo')), 'JST🗾')))
} else {
sys1 %<>% rbind(., data.frame(
Category = 'Current time',
session_info = paste(as.character(lubridate::now('Asia/Tokyo')), 'JST🗾')))
}
sys <- cbind(sys1, sys2) %>%
kbl(caption = 'Additional session information:') %>%
kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
row_spec(0, background = 'DimGrey', color = 'yellow') %>%
column_spec(1, background = 'CornflowerBlue', color = 'red') %>%
column_spec(2, background = 'grey', color = 'black') %>%
column_spec(3, background = 'CornflowerBlue', color = 'blue') %>%
column_spec(4, background = 'grey', color = 'white') %>%
row_spec(9, bold = T, color = 'yellow', background = '#D7261E')
rm(sys1, sys2)
sys
Category | session_info | Category | Sys.info |
---|---|---|---|
version | R version 4.1.0 (2021-05-18) | sysname | Linux |
os | Ubuntu 20.04.2 LTS | release | 5.8.0-54-generic |
system | x86_64, linux-gnu | version | #61~20.04.1-Ubuntu SMP Thu May 13 00:05:49 UTC 2021 |
ui | X11 | nodename | Scibrokes-Trading |
language | en | machine | x86_64 |
collate | en_US.UTF-8 | login | englianhu |
ctype | en_US.UTF-8 | user | englianhu |
tz | Asia/Tokyo | effective_user | englianhu |
date | 2021-05-22 | Current time | 2021-05-22 19:50:18 JST🗾 |