This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
rr test_0_1=mnist_test[,mnist_test[nrow(mnist_test),]==0|mnist_test[nrow(mnist_test),]==1] test_3_5=mnist_test[,mnist_test[nrow(mnist_test),]==3|mnist_test[nrow(mnist_test),]==5] train_0_1=mnist_train[,mnist_train[nrow(mnist_train),]==0|mnist_train[nrow(mnist_train),]==1] train_3_5=mnist_train[,mnist_train[nrow(mnist_train),]==3|mnist_train[nrow(mnist_train),]==5]
rr true_label_test_0_1=test_0_1[nrow(test_0_1),] true_label_test_3_5=test_3_5[nrow(test_3_5),] true_label_train_0_1=train_0_1[nrow(train_0_1),] true_label_train_3_5=train_3_5[nrow(train_3_5),]
rr test_0_1=test_0_1[-nrow(test_0_1),] test_3_5=test_3_5[-nrow(test_3_5),] train_0_1=train_0_1[-nrow(train_0_1),] train_3_5=train_3_5[-nrow(train_3_5),]
rr image((1:28), (1:28), matrix(data=as.numeric(train_0_1[,true_label_train_0_1[1,]==0][,1]), nrow=sqrt(nrow(train_0_1)), ncol=sqrt(nrow(train_0_1))), col=gray.colors(256))

rr image((1:28), (1:28), matrix(data=as.numeric(train_0_1[,true_label_train_0_1[1,]==1][,1]), nrow=sqrt(nrow(train_0_1)), ncol=sqrt(nrow(train_0_1))), col=gray.colors(256))

rr image((1:28), (1:28), matrix(data=as.numeric(train_3_5[,true_label_train_3_5[1,]==3][,1]), nrow=sqrt(nrow(train_3_5)), ncol=sqrt(nrow(train_3_5))), col=gray.colors(256))

rr image((1:28), (1:28), matrix(data=as.numeric(train_3_5[,true_label_train_3_5[1,]==5][,1]), nrow=sqrt(nrow(train_3_5)), ncol=sqrt(nrow(train_3_5))), col=gray.colors(256))

- Theory
- Formula for the gradient of loss function in Logistic Regression Below is the formula for gradient descent process. \[\theta_j := \theta_j - \alpha \frac {1}{m}\sum\limits_{i = 1}^m (\frac {1}{1 + \exp(- {\theta} ^\intercal x^i)} -y^i)x_j^i\] In the formula: \(\theta_j'\)is the updated value of the parameter \(\theta_j\) for the cost function.
\(\theta_j\) is one of the parameter in the parameter vector \(\theta\)
\(\theta\) is the parameter vector to be optmized in the cost function
\(\alpha\) is the learning rate
\(m\) is the number of training data
\(x^i\) is one vector of the training data vectors
\(y^i\) is true class/label for the training data vector \(x^i\)
\(x_j^i\) is the x value for parameter \(\theta_j\)
- Pseudocode for training a Logistic Regression model
matrix_x=matrix(data=input_data, row_number=nrow(training_data), column_number=ncol(training_data)) matrix_y=matrix(data=labels, row_number=nrow(training_data), column_number=1) matrix_theta=matrix(data=initial_theta, row_number=ncol(training_data), column_number=1)
gradient_function=function(matrix_x, matrix_y,matrix_theta){ theta=matrix_theta-alpha(1/nrow(training_data))transpose(matrix_x)%%(1/(1+exp(-matrix_x%*%matrix_theta))-matrix_y) return(theta) }
sigmoid_function=function(z){ g=1/(1+exp(-z)) return(g) }
cost_function=function(matrix_theta){ g=sigmoid_function(matrix_x%%matrix_theta) cost_J=((transpose(-matrix_y)%%log(g))-(transpose(1-matrix_y)%*%log(1-g))) return(cost_J) }
while(J>threshold){ matrix_theta=gradient_function(matrix_x, matrix_y, matrix_theta) J=cost_function(matrix_theta) }
- Calculate the number of operations per gradient descent iteration
- Implementation
sigmoid=function(z){
g=1/(1+exp(-z))
return(g)
}
logit_model=function(input_data, class_data, theta_initial, alpha,epsilon ){
if(length(unique(unlist(class_data)))==2){
init=0
for(i in unique(unlist(class_data)) ){
if(!(i%in%c(0,1))) {
class_data[class_data==i]=init
init=init+1
}}
}
x=t(as.matrix(input_data))
y=t(as.matrix(class_data))
theta=matrix(data=rep(theta_initial, ncol(x)), nrow=ncol(x), ncol=1)
J=epsilon+1
while(J>epsilon)
{
theta=theta-alpha*(t(x)%*%(1/(1+exp(-x%*%theta))-y)/(nrow(x)))
J=(t(-y)%*%log(sigmoid(x%*%theta))-t(1-y)%*%log(1-sigmoid(x%*%theta)))
print(J)
}
return(theta)
}
model_test=function(theta, training_input_data, training_class_data, test_input_data, test_class_data){
if(length(unique(unlist(test_class_data)))==2){
init=0
for(i in unique(unlist(test_class_data)) ){
if(!(i%in%c(0,1))) {
test_class_data[test_class_data==i]=init
init=init+1
}}
}
if(length(unique(unlist(training_class_data)))==2){
init=0
for(i in unique(unlist(training_class_data)) ){
if(!(i%in%c(0,1))) {
training_class_data[training_class_data==i]=init
init=init+1
}}
}
prediction_train=round(sigmoid(t(as.matrix(training_input_data))%*%theta))
error_train=sum(abs(prediction_train-t(as.matrix(training_class_data))))/nrow(t(as.matrix(training_class_data)))
prediction_test=round(sigmoid(t(as.matrix(test_input_data))%*%theta))
error_test=sum(abs(prediction_test-t(as.matrix(test_class_data))))/nrow(t(as.matrix(test_class_data)))
print(cat("Accuracy of Training Set: ", 1-error_train))
print(cat("Accuracy of Test Set: ", 1-error_test))
return(c(1-error_train, 1-error_test))
}
theta_0_1=logit_model(train_0_1,true_label_train_0_1, 0.1,2, 50)
rr model_test(theta, train_0_1, true_label_train_0_1, test_0_1, true_label_test_0_1)
Accuracy of Training Set: 0.9987367NULL
Accuracy of Test Set: 0.9995272NULL
theta_3_5=logit_model(train_3_5,true_label_train_3_5, 0.1 ,2, 70)
model_test(theta_3_5, train_3_5, true_label_train_3_5, test_3_5, new_label_test_3_5)
sample_and_shuffle=function(data, sample_perc){
new_data=sample(data, sample_perc*(ncol(data)), replace=FALSE)
return(new_data)
}
repeat_modeling=function(training_data, test_data, class,repeat_time, size,size_step, initial_theta, alpha, cost){
result=c()
for(i in c(1:repeat_time)){
spsf_train=sample_and_shuffle(training_data,size)
if(identical(unlist(class),unlist(c(3,5)))){
spsf_train=spsf_train[,spsf_train[nrow(spsf_train),]==3|spsf_train[nrow(spsf_train),]==5]
true_label_spsf_train=spsf_train[nrow(spsf_train),]
spsf_train=spsf_train[-nrow(spsf_train),]
spsf_test=test_data
spsf_test=spsf_test[,spsf_test[nrow(spsf_test),]==3|spsf_test[nrow(spsf_test),]==5]
true_label_spsf_test=spsf_test[nrow(spsf_test),]
spsf_test=spsf_test[-nrow(spsf_test),]
}
else if(identical(unlist(class),unlist(c(0,1)))){
spsf_train=spsf_train[,spsf_train[nrow(spsf_train),]==0|spsf_train[nrow(spsf_train),]==1]
true_label_spsf_train=spsf_train[nrow(spsf_train),]
spsf_train=spsf_train[-nrow(spsf_train),]
spsf_test=test_data
spsf_test=spsf_test[,spsf_test[nrow(spsf_test),]==0|spsf_test[nrow(spsf_test),]==1]
true_label_spsf_test=spsf_test[nrow(spsf_test),]
spsf_test=spsf_test[-nrow(spsf_test),]
} else{ print("Wrong Class")}
print(nrow(t(spsf_train)))
print(nrow(t(true_label_spsf_train)))
print(nrow(t(spsf_test)))
print(nrow(t(true_label_spsf_test)))
spsf_theta=logit_model(spsf_train,true_label_spsf_train, initial_theta,alpha, cost)
accuracy=model_test(spsf_theta, spsf_train,true_label_spsf_train, spsf_test, true_label_spsf_test)
result=rbind(result,c(i, size, initial_theta, alpha, cost, accuracy[1],accuracy[2]))
size=size-size_step
}
return(result)
}
- Training
Training 2 models: train_0_1 train_3_5
Repeat for 10 times
ten_times_0_1=repeat_modeling(mnist_train, mnist_test, c(0,1),10, 1,0.1, 0.1, 0.8, 100)
ten_times_3_5=repeat_modeling(mnist_train, mnist_test, c(3,5),10, 1,0.1, 0.1, 1.4, 1000)
Explain the difference
Explain what to do with multiple classes
- Evaluation
Experiment with different initializations
Experiment with different convergence criteria for gradient descent
- Learning Curves
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).
---
title: "Homework 3"
output: html_notebook
---

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 

Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*. 

```{r}
mnist_train=read.csv("mnist_train.csv", header=FALSE)
mnist_test=read.csv("mnist_test.csv", header=FALSE)
```

```{r}
test_0_1=mnist_test[,mnist_test[nrow(mnist_test),]==0|mnist_test[nrow(mnist_test),]==1]
test_3_5=mnist_test[,mnist_test[nrow(mnist_test),]==3|mnist_test[nrow(mnist_test),]==5]
train_0_1=mnist_train[,mnist_train[nrow(mnist_train),]==0|mnist_train[nrow(mnist_train),]==1]
train_3_5=mnist_train[,mnist_train[nrow(mnist_train),]==3|mnist_train[nrow(mnist_train),]==5]
```

```{r}
true_label_test_0_1=test_0_1[nrow(test_0_1),]
true_label_test_3_5=test_3_5[nrow(test_3_5),]
true_label_train_0_1=train_0_1[nrow(train_0_1),]
true_label_train_3_5=train_3_5[nrow(train_3_5),]
```

```{r}
test_0_1=test_0_1[-nrow(test_0_1),]
test_3_5=test_3_5[-nrow(test_3_5),]
train_0_1=train_0_1[-nrow(train_0_1),]
train_3_5=train_3_5[-nrow(train_3_5),]
```

```{r}
image((1:28), (1:28), matrix(data=as.numeric(train_0_1[,true_label_train_0_1[1,]==0][,1]), nrow=sqrt(nrow(train_0_1)), ncol=sqrt(nrow(train_0_1))), col=gray.colors(256))
```

```{r}
image((1:28), (1:28), matrix(data=as.numeric(train_0_1[,true_label_train_0_1[1,]==1][,1]), nrow=sqrt(nrow(train_0_1)), ncol=sqrt(nrow(train_0_1))), col=gray.colors(256))
```

```{r}
image((1:28), (1:28), matrix(data=as.numeric(train_3_5[,true_label_train_3_5[1,]==3][,1]), nrow=sqrt(nrow(train_3_5)), ncol=sqrt(nrow(train_3_5))), col=gray.colors(256))
```

```{r}
image((1:28), (1:28), matrix(data=as.numeric(train_3_5[,true_label_train_3_5[1,]==5][,1]), nrow=sqrt(nrow(train_3_5)), ncol=sqrt(nrow(train_3_5))), col=gray.colors(256))
```

1. Theory
a. Formula for the gradient of loss function in Logistic Regression
Below is the formula for gradient descent process.
$$\theta_j := \theta_j - \alpha \frac {1}{m}\sum\limits_{i = 1}^m (\frac {1}{1 ＋ \exp(－ {\theta} ^\intercal x^i)} -y^i)x_j^i$$
In the formula:
   $\theta_j'$is the updated value of the parameter $\theta_j$ for the cost function.

   $\theta_j$ is one of the parameter in the parameter vector $\theta$

   $\theta$ is the parameter vector to be optmized in the cost function

   $\alpha$ is the learning rate

   $m$ is the number of training data

   $x^i$ is one vector of the training data vectors

   $y^i$ is true class/label for the training data vector $x^i$

   $x_j^i$ is the x value for parameter  $\theta_j$





b. Pseudocode for training a Logistic Regression model

   matrix_x=matrix(data=input_data, row_number=nrow(training_data), column_number=ncol(training_data))
   matrix_y=matrix(data=labels, row_number=nrow(training_data), column_number=1)
   matrix_theta=matrix(data=initial_theta, row_number=ncol(training_data), column_number=1)
   
   gradient_function=function(matrix_x, matrix_y,matrix_theta){
   theta=matrix_theta-alpha*(1/nrow(training_data))transpose(matrix_x)%*%(1/(1+exp(-matrix_x%*%matrix_theta))-matrix_y)
   return(theta)
   }
   
   sigmoid_function=function(z){
   g=1/(1+exp(-z))
   return(g)
   }
   
   cost_function=function(matrix_theta){
   g=sigmoid_function(matrix_x%*%matrix_theta)
   cost_J=((transpose(-matrix_y)%*%log(g))-(transpose(1-matrix_y)%*%log(1-g)))
   return(cost_J)
   }

   while(J>threshold){
   matrix_theta=gradient_function(matrix_x, matrix_y, matrix_theta)
   J=cost_function(matrix_theta)
   }


c. Calculate the number of operations per gradient descent iteration

2. Implementation

```{r}
sigmoid=function(z){
  g=1/(1+exp(-z))
  return(g)
}


logit_model=function(input_data, class_data, theta_initial, alpha,epsilon ){
  
  if(length(unique(unlist(class_data)))==2){
    
    init=0
    for(i in unique(unlist(class_data)) ){
      if(!(i%in%c(0,1))) { 
        class_data[class_data==i]=init
        init=init+1
        
      }}  
  }
  
  
  x=t(as.matrix(input_data))
  y=t(as.matrix(class_data))
  theta=matrix(data=rep(theta_initial, ncol(x)), nrow=ncol(x), ncol=1)
  J=epsilon+1
  while(J>epsilon)
  {
    
    theta=theta-alpha*(t(x)%*%(1/(1+exp(-x%*%theta))-y)/(nrow(x)))
    J=(t(-y)%*%log(sigmoid(x%*%theta))-t(1-y)%*%log(1-sigmoid(x%*%theta)))
    print(J)
  }
  return(theta)
}
```

```{r}
model_test=function(theta, training_input_data, training_class_data, test_input_data, test_class_data){
  
  if(length(unique(unlist(test_class_data)))==2){
    
    init=0
    for(i in unique(unlist(test_class_data)) ){
      if(!(i%in%c(0,1))) { 
        test_class_data[test_class_data==i]=init
        init=init+1
        
      }}  
  }
  if(length(unique(unlist(training_class_data)))==2){
    
    init=0
    for(i in unique(unlist(training_class_data)) ){
      if(!(i%in%c(0,1))) { 
        training_class_data[training_class_data==i]=init
        init=init+1
        
      }}  
  }
  
  prediction_train=round(sigmoid(t(as.matrix(training_input_data))%*%theta))
  error_train=sum(abs(prediction_train-t(as.matrix(training_class_data))))/nrow(t(as.matrix(training_class_data)))
  prediction_test=round(sigmoid(t(as.matrix(test_input_data))%*%theta))
  error_test=sum(abs(prediction_test-t(as.matrix(test_class_data))))/nrow(t(as.matrix(test_class_data)))
  
  print(cat("Accuracy of Training Set: ", 1-error_train))
  print(cat("Accuracy of Test Set: ", 1-error_test))
  return(c(1-error_train, 1-error_test))
}

```

```{r}
theta_0_1=logit_model(train_0_1,true_label_train_0_1, 0.1,2, 50)
```

```{r}

```

```{r}
model_test(theta_0_1, train_0_1, true_label_train_0_1, test_0_1, true_label_test_0_1)
```



```{r}


theta_3_5=logit_model(train_3_5,true_label_train_3_5, 0.1 ,2, 70)
```


```{r}
model_test(theta_3_5, train_3_5, true_label_train_3_5, test_3_5, new_label_test_3_5)
```

```{r}

sample_and_shuffle=function(data, sample_perc){
  new_data=sample(data, sample_perc*(ncol(data)), replace=FALSE)
  return(new_data)
}

```






```{r}
repeat_modeling=function(training_data, test_data, class,repeat_time, size,size_step, initial_theta, alpha, cost){

result=c()  
for(i in c(1:repeat_time)){
spsf_train=sample_and_shuffle(training_data,size)
if(identical(unlist(class),unlist(c(3,5)))){
spsf_train=spsf_train[,spsf_train[nrow(spsf_train),]==3|spsf_train[nrow(spsf_train),]==5]
true_label_spsf_train=spsf_train[nrow(spsf_train),]
spsf_train=spsf_train[-nrow(spsf_train),]

spsf_test=test_data
spsf_test=spsf_test[,spsf_test[nrow(spsf_test),]==3|spsf_test[nrow(spsf_test),]==5]
true_label_spsf_test=spsf_test[nrow(spsf_test),]
spsf_test=spsf_test[-nrow(spsf_test),]
}
else if(identical(unlist(class),unlist(c(0,1)))){
spsf_train=spsf_train[,spsf_train[nrow(spsf_train),]==0|spsf_train[nrow(spsf_train),]==1]
true_label_spsf_train=spsf_train[nrow(spsf_train),]
spsf_train=spsf_train[-nrow(spsf_train),]

spsf_test=test_data
spsf_test=spsf_test[,spsf_test[nrow(spsf_test),]==0|spsf_test[nrow(spsf_test),]==1]
true_label_spsf_test=spsf_test[nrow(spsf_test),]
spsf_test=spsf_test[-nrow(spsf_test),]
} else{ print("Wrong Class")}

print(nrow(t(spsf_train)))
print(nrow(t(true_label_spsf_train)))
print(nrow(t(spsf_test)))
print(nrow(t(true_label_spsf_test)))
spsf_theta=logit_model(spsf_train,true_label_spsf_train, initial_theta,alpha, cost)



accuracy=model_test(spsf_theta, spsf_train,true_label_spsf_train, spsf_test, true_label_spsf_test)

result=rbind(result,c(i, size, initial_theta, alpha, cost, accuracy[1],accuracy[2]))

size=size-size_step
}

return(result)  
}

```


3. Training

a. Training 2 models:
train_0_1
train_3_5

b. Repeat for 10 times
```{r}

ten_times_0_1=repeat_modeling(mnist_train, mnist_test, c(0,1),10, 1,0.1, 0.1, 0.8, 100)
ten_times_3_5=repeat_modeling(mnist_train, mnist_test, c(3,5),10, 1,0.1, 0.1, 1.4, 1000)
```

c. Explain the difference

d. Explain what to do with multiple classes

4. Evaluation

a. Experiment with different initializations 

b. Experiment with different convergence criteria for gradient descent

5. Learning Curves

a. 



Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Cmd+Option+I*.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Cmd+Shift+K* to preview the HTML file).
