This is a code draft.
df <- read.csv("data.csv")
df$Group <- as.factor(df$Group)
ncol(df)
## [1] 3048
nrow(df)
## [1] 1600
#import lookup table
lookup <- read.csv("lookup.csv")
lookup$Group <- as.factor(lookup$Group)
lookup[1,2] <- "Bangor"
lookup <- lookup[,1:2]
#inner join
df.join <- dplyr::inner_join(df, lookup, by = "Group")
nrow(df.join) #738
## [1] 738
df.scale.p <- df.join[,3:3048]
df.s <- df.scale.p*1000000
#df.join["Group.Name"]
df.m <- bind_cols(df.join["Group.Name"], df.s)
names(df.m)[1] <- "Group"
#df.m$Group
df.m <- df.m[-c(677:678),] #delete the two observation for a one group
#examples for training testing partitioning
training.samples <- df.m$Group %>%
createDataPartition(p = 0.8, list = FALSE)
train.data <- df.m[training.samples,]
test.data <- df.m[-training.samples,]
##### we do not need this lines
# Estimate preprocessing parameters
preproc.param <- train.data %>%
preProcess(method = c("center", "scale"))
# Transform the data using the estimated parameters
train.transformed <- preproc.param %>% predict(train.data)
test.transformed <- preproc.param %>% predict(test.data)
# Fit the model DFA
#use subset3 for deleting the outlier two groups
#run this again after delete the outlier
model <- lda(Group~., data = train.transformed)
## Warning in lda.default(x, grouping, ...): variables are collinear
class(model)
## [1] "lda"
# Make predictions
class(test.transformed)
## [1] "data.frame"
predictions <- model %>% predict(test.transformed)
#predictions$posterior
p.group <- predictions$class
p.test <- cbind(p.group, test.data)
#write.csv(p.test,"p.test.csv")
# Model accuracy
#predictions$class == test.transformed$Group
mean(predictions$class==test.transformed$Group)
## [1] 0.6780822
#plot(model)#cluttered dont run
#predict(model)$x
#plot
lda.data <- cbind(train.transformed, predict(model)$x)
#using the below to subset the outlier
ggplot(lda.data, aes(LD1, LD2)) +
geom_point(aes(color = Group))
#subset
train.transformed.subset1 <- filter(lda.data, LD1<40)
train.transformed.subset2 <- filter(train.transformed.subset1, LD2>-10)
train.transformed.subset3 <- train.transformed.subset2[,1:3047]
#nrow(train.transformed)
#nrow(train.transformed.subset3) #deleted total 64 counts
## replace train.transformed with subset 3 (removed outlier)
#this is after outlier removed
train.transformed <- train.transformed.subset3
model <- lda(Group~., data = train.transformed)
## Warning in lda.default(x, grouping, ...): variables are collinear
#predictions$class == test.transformed$Group
predictions <- model %>% predict(test.transformed)
lda.data <- cbind(train.transformed, predict(model)$x)
mean(predictions$class==test.transformed$Group)
## [1] 0.6164384
######
ggplot(lda.data, aes(LD1, LD2)) +
geom_point(aes(color = Group))
datPred<-data.frame(Group = predict(model)$class,
predict(model)$x)
dat_ell <- data.frame()
for(g in levels(datPred$Group)){
dat_ell <- rbind(dat_ell,
cbind(as.data.frame(with(datPred[datPred$Group == g,],
ellipse(cor(LD1, LD2), centre=c(mean(LD1),mean(LD2))))),
Group=g))}
ggplot(datPred, aes(x=LD1, y=LD2, col=Group) ) +
geom_point( size = 0.5, aes(color = Group))+theme_bw()+
geom_path(data=dat_ell,aes(x=x,y=y,color=Group),size=0.3,linetype=1)
### LASSO
# use x.train & y.train as vector
x.train <- data.matrix(train.transformed[,2:3047])
y.train <- train.transformed$Group
unique(y.train) #total 12 classes
## [1] "Bangor" "Brassfield"
## [3] "Burlington" "Upper Complex Gravels"
## [5] "Fort Payne" "Upper St. Louis"
## [7] "Dover (Lower St. Louis)" "Novaculite"
## [9] "Bigby Cannon" "Ste. Genevieve"
## [11] "Tuscaloosa Gravel" "Warsaw"
#LASSO here
fit.lasso <- glmnet(x.train, y.train, family = "multinomial",
type.multinomial = "grouped")
plot(fit.lasso, xvar = "lambda", label = TRUE, type.coef = "2norm")
cvfit=cv.glmnet(x.train, y.train, family="multinomial",
type.multinomial = "grouped", nfolds = 5)
plot(cvfit)
lambda.min <- cvfit$lambda.min
fit.lasso.best <- glmnet(x.train, y.train, family = "multinomial",
type.multinomial = "grouped", lambda = lambda.min)
a = coef(fit.lasso.best)
b = as.matrix(a)
tmp_coeffs <- coef.glmnet(fit.lasso, lambda.min)
#tmp_coeffs <- coef.glmnet(fit.lasso)
df.coef <- data.frame(name = tmp_coeffs[[1]]@Dimnames[[1]][tmp_coeffs[[1]]@i + 1],
coefficient = tmp_coeffs[[1]]@x)
write.csv(df.coef,"dfCoef.csv")