Learning R-basics

getwd() #see the directory 
## [1] "I:/Varsity DOCU/WM ASDS/Semester 3/WMASDS03 Statistical Infarence/Code"
#setwd() #set the directory 
#######putting values for a variable/ in a array
x<-3
class (x); str(x)
## [1] "numeric"
##  num 3
x=3
str(x) ; class (x)
##  num 3
## [1] "numeric"
######two types of observations: string/factor OR integers/numbers
x<-"factors"
x
## [1] "factors"
class(x); str(x)
## [1] "character"
##  chr "factors"
x1<-1:10; x2<-c(1:10); x3<-c(1,10)
x1; x2; x3
##  [1]  1  2  3  4  5  6  7  8  9 10
##  [1]  1  2  3  4  5  6  7  8  9 10
## [1]  1 10
class(x1); str(x1)
## [1] "integer"
##  int [1:10] 1 2 3 4 5 6 7 8 9 10
as.numeric(x1)
##  [1]  1  2  3  4  5  6  7  8  9 10
class(x1); str(x1)
## [1] "integer"
##  int [1:10] 1 2 3 4 5 6 7 8 9 10
#######4 arithmatic operations+; -; *; /
######square/cube etcsqrt; x^2; x^3

Let’s make a 2 X 3 matrix:

xx<-1:6
mat<-matrix(xx, 2,3) 
mat1<-matrix(xx, 2,3,byrow=TRUE)
class(mat);str(mat)
## [1] "matrix" "array"
##  int [1:2, 1:3] 1 2 3 4 5 6

Let’s make a 2 X 3 matrix:

x1<-1:3;x2<-4:6
mat<-rbind(x1,x2) 
class(mat);str(mat)
## [1] "matrix" "array"
##  int [1:2, 1:3] 1 4 2 5 3 6
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:2] "x1" "x2"
##   ..$ : NULL
x<-1:3
as.matrix(x)
##      [,1]
## [1,]    1
## [2,]    2
## [3,]    3
class(x);str(x)
## [1] "integer"
##  int [1:3] 1 2 3

Let’s consider a buit in data of R: iris

data<-iris
class(data); str(data); dim(data)
## [1] "data.frame"
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## [1] 150   5
#write.table(data,”data.txt”)
summary(data)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
#making histogram for 4 numeric variables separately in separate window
hist(data$Sepal.Length) 

hist(data$Sepal.Width) 

hist(data$Petal.Length) 

hist(data$Petal.Width)

#making histogram for 4 numeric variables separately in same window
par(mfrow=c(2,2))
hist(data$Sepal.Length) 
hist(data$Sepal.Width) 
hist(data$Petal.Length) 
hist(data$Petal.Width)

par(mfrow=c(1,1))
#making histogram for 4 numeric variables separately in same window by LOOPING
ndata <- data[,-5]

par(mfrow=c(2,2))
for(i in 1:ncol(ndata)){
  hist(ndata[,i])
}

par(mfrow=c(1,1))
#making pie chart for the categorical variable
table<-table(iris[,5])
table
## 
##     setosa versicolor  virginica 
##         50         50         50
pie(table,lmain="Pie Chart of Species")
## Warning in text.default(1.1 * P$x, 1.1 * P$y, labels[i], xpd = TRUE, adj =
## ifelse(P$x < : "lmain" is not a graphical parameter

## Warning in text.default(1.1 * P$x, 1.1 * P$y, labels[i], xpd = TRUE, adj =
## ifelse(P$x < : "lmain" is not a graphical parameter

## Warning in text.default(1.1 * P$x, 1.1 * P$y, labels[i], xpd = TRUE, adj =
## ifelse(P$x < : "lmain" is not a graphical parameter
## Warning in title(main = main, ...): "lmain" is not a graphical parameter

####################3D pie install.packages("plotrix")
#library(plotrix)
#pie3D(table,main="Pie Chart of Species")
plot(table)

prop.table(table)
## 
##     setosa versicolor  virginica 
##  0.3333333  0.3333333  0.3333333

===========================================================================================

Lecture 1

Mu (µ) -> N (Population) [We need to analysis it with systematic way to select the sample size] x_bar -> n (Sample) [Must be represent of N]

µ_hat = x_bar µ_hat - x_bar -> minimum [(R1,R2)-> Interval estimate ]

H0: µ_hat = 3.25 H1: µ_hat != 3.25

Hypothesis is an unproven statement.

NUll means their are no different between population mean and sample mean

Data and objective direct what should we do

Categorical Count Data
Gender Phone call per day

Property of point estimator: * Unbiased * Consistency * Efficiency

@ If data comes from normal distribution it’s have a mean @ When we find the distribution then we go to the next step

Unbiased Condition µ_hat - x_bar -> minimum µ_hat - E(x_bar) = 0; Unbiased [if 0 increase that’s means bias high,if 0 goes negative its means low bias ]

Consistency

Efficiency Variation minimum consistency; at the time of comparison which have low variation

Important Distribution Normal,Binomial,Poision, Chi-Square,T-Distribution, F-Distribution

Finding point estimates:

  • Methods of moments
  • Maximum Likelihood Estimator

============================================================================================

Lecture 2

xx<-1:4
mat<-matrix(xx, 2,2) 
mat
##      [,1] [,2]
## [1,]    1    3
## [2,]    2    4
dmat <- as.data.frame(mat)
dmat
##   V1 V2
## 1  1  3
## 2  2  4

** If it’s a data frame than we can directly call it, if it’s a matrix then we call data from it’s location. Basically matrix used in Decomposition

=> hist ; Check the distribution of the data

data <- iris
head(data)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
hist(data$Sepal.Length, main="Figure",xlab="Sepal Length", ylab="")

par(mfrow=c(2,2)) #printing 2 row and 2 columns 
for (i in 1:4){
  hist(data[,i])
  
}

par(mfrow=c(1,1))
m <- c()
for (i in 1:4){
  m[i]<-mean(data[,i])
  print(m[i])
}
## [1] 5.843333
## [1] 3.057333
## [1] 3.758
## [1] 1.199333
m <- list()
for (i in 1:4){
  m[i]<-mean(data[,i])
  print(m[i])
}
## [[1]]
## [1] 5.843333
## 
## [[1]]
## [1] 3.057333
## 
## [[1]]
## [1] 3.758
## 
## [[1]]
## [1] 1.199333
  • If we want to store the result one by one then we use list insted of vector

@ For non numeric data

tab <- table(data$Species)
print(tab)
## 
##     setosa versicolor  virginica 
##         50         50         50
prop.table(tab) #proportion of the table
## 
##     setosa versicolor  virginica 
##  0.3333333  0.3333333  0.3333333
  • When there are so many category then we skip the pie chart

  • Moments works with fact

  • For wide application we need to drive equation

  • For well judgment we need to understand data set very well

  • For distribution moments give different types of output

MLE Model What is the chance to happen a event = Probability. How this condition full fill under this condition and how frequently it happen is called MLE

  • Goodness of fit measure model -> Mathematical Background -> Joint probability distribution [WE prefer normal]

f(x) = 1/Π √(2)σ e^-1/2*((x-µ)/σ)^2 ; -α <= x <= +α

L(x)/L(x;µ,σ) [attempts to find the unknown parameters µ,σ function goes maximum]

Example of MLE:

#install.packages("mle.tools")
library(mle.tools)
## Warning: package 'mle.tools' was built under R version 4.2.3
x <- rnorm(n = 100, mean = 0.0, sd = 1.0)
hist(x)

d<-density(x)
plot(d$x,d$y,col=2)

pdf <- quote(1 / (sqrt(2 * pi) * sigma) * exp(-0.5 / sigma ^ 2 * (x - mu) ^ 2))
lpdf <- quote(- log(sigma) - 0.5 / sigma ^ 2 * (x - mu) ^ 2)
mu.hat <- mean(x)
sigma.hat<-sqrt(var(x))
r<-coxsnell.bc(density = pdf, 
               logdensity = lpdf, 
               n = length(x), 
               parms = c("mu", "sigma"), 
               mle = c(mu.hat, sigma.hat), 
               lower = '-Inf', 
               upper = 'Inf')
print(mu.hat)
## [1] 0.02422493
print(r$mle[1])
##         mu 
## 0.02422493
#abline(v=mu.hat,col=4)
#abline(v=r$mle[1],col=1)

=============================================================================================

Third Class 26.05.23

  • Normal distribution are convenient
  • Hypothetical Population Generate Normal Distribution
set.seed(100) #simple random sampling without replacement
rnorm(5)
## [1] -0.50219235  0.13153117 -0.07891709  0.88678481  0.11697127
set.seed(100) #fixed variable
rnorm(5)
## [1] -0.50219235  0.13153117 -0.07891709  0.88678481  0.11697127
pop <- rnorm(10000,10,1) #population distribution
hist(pop) #Less expensive visualization, Very informal judgmental representation 

est1 <- replicate(expr = mean(sample(x=pop,size=5)),n=25000)
est2 <- replicate(expr = mean(sample(x=pop,size=25)),n=25000)
par(mfrow=c(2,2))
hist(est1);hist(est2);hist(pop)

ss <- sample(x=pop,size = 1000) #sample distribution #only taking a sample of size 1000 

Checking the properties of a good estimator in R: An Example

# plot density estimate 
plot(density(ss),col="green",lwd = 2,ylim = c(0, 2),xlab ="estimates",main = "Sampling Distributions of Unbiased Estimators") #here there are no length for x because population range same
# add density estimate for the distribution of the sample 
#mean with n=5 to the plot
lines(density(est1),col="steelblue",lwd = 2)
# add density estimate for the distribution of the sample mean with n=25 to the plot
lines(density(est2),col="red2",lwd = 2)
# add a vertical line at the true parameter
abline(v = 10, lty = 2) #Mu 10 for this that's why we take 10

# add N(10,1)/pop. density to the plot
curve(dnorm(x, mean = 10),lwd = 2,lty = 2,add = T)

############################moments   install.packages('moments')
library(moments)
x<-sample(pop,size=100,replace=T)
# Raw Moments/Central Moments
all.moments(x,order.max=4,central=F)
## [1]     1.00000    10.12434   103.38861  1064.75082 11056.53480
hist(x);qqnorm(x);qqline(x,col="red")

###Interval Estimator
pnorm(0.05)
## [1] 0.5199388
#z.test(x=sample,sigma.x = sqrt(var(sample)),conf.level = 0.95)