R code for DFA and LASSO

This is a code draft.

df <- read.csv("data.csv")
df$Group <- as.factor(df$Group)
ncol(df)
## [1] 3048
nrow(df)
## [1] 1600
#import lookup table
lookup <- read.csv("lookup.csv")
lookup$Group <- as.factor(lookup$Group)

lookup[1,2] <- "Bangor"
lookup <- lookup[,1:2]

#inner join 
df.join <- dplyr::inner_join(df, lookup, by = "Group")
nrow(df.join) #738
## [1] 738
df.scale.p <- df.join[,3:3048]
df.s <- df.scale.p*1000000

#df.join["Group.Name"]

df.m <- bind_cols(df.join["Group.Name"], df.s)

names(df.m)[1] <- "Group"
#df.m$Group
df.m <- df.m[-c(677:678),] #delete the two observation for a one group 

#examples for training testing partitioning
training.samples <- df.m$Group %>%
  createDataPartition(p = 0.8, list = FALSE)

train.data <- df.m[training.samples,]
test.data <- df.m[-training.samples,]


##### we do not need this lines
# Estimate preprocessing parameters
preproc.param <- train.data %>% 
  preProcess(method = c("center", "scale"))
# Transform the data using the estimated parameters
train.transformed <- preproc.param %>% predict(train.data)
test.transformed <- preproc.param %>% predict(test.data)


# Fit the model DFA
#use subset3 for deleting the outlier two groups 
#run this again after delete the outlier
model <- lda(Group~., data = train.transformed) 
## Warning in lda.default(x, grouping, ...): variables are collinear
class(model)
## [1] "lda"
# Make predictions
class(test.transformed)
## [1] "data.frame"
predictions <- model %>% predict(test.transformed)
#predictions$posterior
p.group <- predictions$class
p.test <- cbind(p.group, test.data)
#write.csv(p.test,"p.test.csv")

# Model accuracy
#predictions$class == test.transformed$Group
mean(predictions$class==test.transformed$Group)
## [1] 0.6780822
#plot(model)#cluttered dont run
#predict(model)$x

#plot
lda.data <- cbind(train.transformed, predict(model)$x)

#using the below to subset the outlier
ggplot(lda.data, aes(LD1, LD2)) +
  geom_point(aes(color = Group))

#subset
train.transformed.subset1 <- filter(lda.data, LD1<40)
train.transformed.subset2 <- filter(train.transformed.subset1, LD2>-10)
train.transformed.subset3 <- train.transformed.subset2[,1:3047]

#nrow(train.transformed)
#nrow(train.transformed.subset3) #deleted total 64 counts
## replace train.transformed with subset 3 (removed outlier)

#this is after outlier removed

train.transformed <- train.transformed.subset3
model <- lda(Group~., data = train.transformed) 
## Warning in lda.default(x, grouping, ...): variables are collinear
#predictions$class == test.transformed$Group
predictions <- model %>% predict(test.transformed)
lda.data <- cbind(train.transformed, predict(model)$x)
mean(predictions$class==test.transformed$Group)
## [1] 0.6164384
######
ggplot(lda.data, aes(LD1, LD2)) +
  geom_point(aes(color = Group))

datPred<-data.frame(Group = predict(model)$class,
                    predict(model)$x)

dat_ell <- data.frame()

for(g in levels(datPred$Group)){
  dat_ell <- rbind(dat_ell, 
              cbind(as.data.frame(with(datPred[datPred$Group == g,], 
              ellipse(cor(LD1, LD2), centre=c(mean(LD1),mean(LD2))))),
              Group=g))}

ggplot(datPred, aes(x=LD1, y=LD2, col=Group) ) + 
  geom_point( size = 0.5, aes(color = Group))+theme_bw()+
  geom_path(data=dat_ell,aes(x=x,y=y,color=Group),size=0.3,linetype=1) 

### LASSO
# use x.train & y.train as vector
x.train <- data.matrix(train.transformed[,2:3047])
y.train <- train.transformed$Group
unique(y.train) #total 12 classes
##  [1] "Bangor"                  "Brassfield"             
##  [3] "Burlington"              "Upper Complex Gravels"  
##  [5] "Fort Payne"              "Upper St. Louis"        
##  [7] "Dover (Lower St. Louis)" "Novaculite"             
##  [9] "Bigby Cannon"            "Ste. Genevieve"         
## [11] "Tuscaloosa Gravel"       "Warsaw"
#LASSO here
fit.lasso <- glmnet(x.train, y.train, family = "multinomial", 
                    type.multinomial = "grouped")

plot(fit.lasso, xvar = "lambda", label = TRUE, type.coef = "2norm")

cvfit=cv.glmnet(x.train, y.train, family="multinomial", 
                type.multinomial = "grouped", nfolds = 5)

plot(cvfit)

lambda.min <- cvfit$lambda.min
fit.lasso.best <- glmnet(x.train, y.train, family = "multinomial", 
                         type.multinomial = "grouped", lambda = lambda.min)

a = coef(fit.lasso.best)
b = as.matrix(a)

tmp_coeffs <- coef.glmnet(fit.lasso, lambda.min)

#tmp_coeffs <- coef.glmnet(fit.lasso)
df.coef <- data.frame(name = tmp_coeffs[[1]]@Dimnames[[1]][tmp_coeffs[[1]]@i + 1], 
                      coefficient = tmp_coeffs[[1]]@x)
write.csv(df.coef,"dfCoef.csv")