Supervised Learning | Unsupervised Learning | Reinforcement Learning
---|---|---
Use labeled data to learn a mapping from features to outputs | Find hidden patterns when there is no labeled data | Take actions based on policies and rewards, with possibly delayed feedback
Accuracy of a model is about reducing false positives (FP) and false negatives (FN): accuracy = (TP + TN) / (TP + TN + FP + FN).
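As a quick illustration of that formula in R (the confusion-matrix counts below are made up):
# hypothetical confusion-matrix counts
TP = 50; TN = 40; FP = 5; FN = 5
(TP + TN) / (TP + TN + FP + FN) # accuracy = 0.9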
Extrapolation is the use of a regression line for prediction far outside the domain of values of the feature variable x that you use to predict y. This can be dangerous!
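A minimal sketch of the danger, with made-up data: the true relationship below is quadratic, but a straight line fits the training range x = 1..5 reasonably well and then fails badly far outside it.
# hypothetical example: the true relationship is quadratic, but we fit a line
x = 1:5
y = x^2 # true values: 1 4 9 16 25
badModel = lm(y ~ x) # fitted line: slope 6, intercept -7; looks acceptable on x = 1..5
predict(badModel, data.frame(x = 20)) # extrapolated prediction: 113
20^2 # true value: 400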
If we draw many independent random samples from a population, each of the same large sample size, and plot the distribution of their means (or some other point estimate), this sampling distribution of means will approach a normal distribution.
The CLT helps us address sampling variability: we construct a confidence interval centered on our sample mean, usually a 95% confidence interval. The interpretation is that if we repeatedly drew samples and built a 95% interval around each sample mean, about 95% of those intervals would contain the population mean.
Estimated 95% range: x̄ ± 1.96 * se, where se = s/√n is the standard error of the mean.
library(dplyr)
# demonstrating the CLT
population = sample(1:100, size = 10000, replace = T) # generate our "population" (sample() here just creates data; it is not one of our samples)
populationmean = mean(population)
samplingdistribution = NULL # sampling distribution of sample means
numberofsamples = 100 # number of samples (i.e., sample means) in this distribution
lowrange = highrange = NULL
samplesize = 900 # < 10% of the population (for independence) and > 30 (for approximate normality)
for (i in 1:numberofsamples) # repeat the sampling experiment many times
{
  y = sample(population, size = samplesize, replace = T)
  samplingdistribution = append(samplingdistribution, mean(y))
  # check whether this sample's 95% confidence interval contains the population mean
  se = sd(y) / sqrt(samplesize) # standard error of the mean (psych::describe(y)$se gives the same value)
  lowrange = append(lowrange, mean(y) - 1.96 * se)
  highrange = append(highrange, mean(y) + 1.96 * se)
}
# roughly 95% of the 95% intervals should contain the population mean, consistent with the CLT
allranges = data.frame(samplingdistribution, lowrange, highrange, populationmean)
head(allranges)
samplingdistribution lowrange highrange populationmean
1 49.90667 48.01250 51.80083 50.3981
2 47.72667 45.81388 49.63945 50.3981
3 51.09111 49.21649 52.96573 50.3981
4 49.58778 47.66128 51.51427 50.3981
5 51.74000 49.83871 53.64129 50.3981
6 50.31000 48.42538 52.19462 50.3981
withinrange = allranges %>% filter(lowrange <= populationmean & populationmean <= highrange)
head(withinrange)
samplingdistribution lowrange highrange populationmean
1 49.90667 48.01250 51.80083 50.3981
2 51.09111 49.21649 52.96573 50.3981
3 49.58778 47.66128 51.51427 50.3981
4 51.74000 49.83871 53.64129 50.3981
5 50.31000 48.42538 52.19462 50.3981
6 49.57556 47.74120 51.40991 50.3981
nrow(withinrange)/nrow(allranges) * 100 # percentage of intervals containing the population mean; close to 95%, varies run to run
[1] 98
# the sampling distribution of means should itself be approximately normal (plotted below)
samplingdistribution = as.data.frame(samplingdistribution)
head(samplingdistribution)
samplingdistribution
1 49.90667
2 47.72667
3 51.09111
4 49.58778
5 51.74000
6 50.31000
population = as.data.frame(population)
head(population)
population
1 93
2 57
3 4
4 35
5 90
6 95
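The data frames above are built for plotting, but the notes stop before the plot itself; a minimal sketch of the two histograms with ggplot2, using the data frames constructed above:
library(ggplot2)
# the sampling distribution of means: approximately normal, as the CLT predicts
ggplot(samplingdistribution, aes(x = samplingdistribution)) +
  geom_histogram(bins = 20, fill = "blue")
# the population itself: roughly uniform on 1..100, clearly not normal
ggplot(population, aes(x = population)) +
  geom_histogram(bins = 20, fill = "gray")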
library(ggplot2)
# linear regression
# y = ax + intercept
x = c(1,2,3,4,5) # training data
y = c(2,4,6,8,10) # training data
lrModel = lm(y~x)
print(lrModel)
Call:
lm(formula = y ~ x)
Coefficients:
(Intercept) x
2.383e-15 2.000e+00
df = data.frame(x,y)
df
x y
1 1 2
2 2 4
3 3 6
4 4 8
5 5 10
ggplot(df, aes(x=x, y=y)) + geom_point(color="blue") + geom_smooth(method="lm", formula=y~x)
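As a quick check, the fitted model can also be used to predict unseen x values (the test points below are arbitrary); with an intercept of ~0 and a slope of 2, the predictions should be 12, 14, and 16:
predict(lrModel, data.frame(x = c(6, 7, 8))) # predictions from the fitted line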
library(ggplot2)
# polynomial regression via a transformed feature
# y = a * x^3 + intercept (linear in the coefficients, so lm still applies)
x = c(1,2,3,4,5) # training data
y = c(1,8,27,64,125) # training data
lrModel = lm(y~I(x^3))
print(lrModel)
Call:
lm(formula = y ~ I(x^3))
Coefficients:
(Intercept) I(x^3)
0 1
df = data.frame(x,y)
ggplot(df, aes(x=x, y=y)) + geom_point(color="blue") + geom_smooth(method="lm", formula=y~I(x^3))
predict(lrModel, data.frame(x=c(6,7,8,9,10))) # predict on test data; x = 6..10 is outside the training range, so this is extrapolation, and it works here only because the fitted cubic is the true relationship
1 2 3 4 5
216 343 512 729 1000