This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(readxl)
gpa_iq <- read_excel("C:/Users/alyss/OneDrive/Desktop/gpa_iq.xlsx")
View(gpa_iq)
library(ggplot2)
ggplot(data = gpa_iq, mapping = aes(x = iq, y = gpa)) + geom_point(mapping = aes(color = iq))
gpa_iq1<-subset(gpa_iq, gpa_iq$gender == '1')
gpa_iq2<-subset(gpa_iq, gpa_iq$gender == '2')
ggplot(data = gpa_iq1, mapping = aes(x = iq, y = gpa)) + geom_point(mapping = aes(color = iq))
ggplot(data = gpa_iq2, mapping = aes(x = iq, y = gpa)) + geom_point(mapping = aes(color = iq))
gpa_aver1<- mean(gpa_iq1$gpa)
gpa_aver1
## [1] 7.696548
gpa_aver2<- mean(gpa_iq2$gpa)
gpa_aver2
## [1] 7.281638
iq_average1<-mean(gpa_iq1$iq)
iq_average2<-mean(gpa_iq2$iq)
iq_average1
## [1] 105.8387
iq_average2
## [1] 110.9574
data <- data.frame(
Group = c("Gender 1", "Gender 2"),
Average = c(gpa_aver1, gpa_aver2))
ggplot(data, aes(x = Group, y = Average, fill = Group)) +
geom_col() +
theme_minimal() +
labs(title = "Comparison of Average GPA", x = "Gender", y = "Average") +
scale_fill_manual(values = c("Group 1" = "blue", "Group 2" = "red"))
data2 <- data.frame(
Group = c("Gender 1", "Gender 2"),
Average = c(iq_average1, iq_average2))
ggplot(data2, aes(x = Group, y = Average, fill = Group)) +
geom_col() +
theme_minimal() +
labs(title = "Comparison of Average IQ", x = "Gender", y = "Average") +
scale_fill_manual(values = c("Group 1" = "blue", "Group 2" = "red"))
gpa<- gpa_iq[-1]
View(gpa)
table(gpa$gpa)
##
## 0.53 1.76 2.412 3.408 3.647 3.82 3.936 4 4.643 4.7 4.885 5.062 5.237
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 5.528 5.936 6 6.057 6.173 6.231 6.419 6.938 7.167 7.295 7.333 7.47 7.571
## 1 1 1 2 1 1 1 1 2 2 1 2 1
## 7.585 7.588 7.598 7.643 7.647 7.65 7.822 7.825 7.833 7.882 7.94 7.996 7.998
## 1 1 1 1 2 1 1 1 1 1 1 1 1
## 8 8.167 8.175 8.235 8.292 8.333 8.353 8.714 8.833 8.882 8.938 8.998 9
## 2 2 2 1 1 1 1 1 1 1 1 1 1
## 9.167 9.333 9.348 9.41 9.429 9.5 9.571 9.585 9.648 9.763 9.999 10.14 10.58
## 3 1 1 1 1 2 1 1 1 1 1 1 1
## 10.7 10.76
## 1 1
table(gpa$iq)
##
## 72 74 77 79 86 89 90 91 93 96 97 98 100 102 103 104 105 106 107 108
## 1 1 1 1 1 1 1 1 2 1 2 1 2 2 4 2 3 3 4 1
## 109 110 111 112 113 114 115 116 118 119 120 123 124 126 127 128 130 132 136
## 1 4 4 4 3 4 2 1 2 3 2 2 2 1 2 3 1 1 1
table(gpa$gender)
##
## 1 2
## 31 47
normaliZe<- function(x) {
return ((x-min(x))/(max(x)-min(x)))
}
normaliZe(c(1,2,3,4,5))
## [1] 0.00 0.25 0.50 0.75 1.00
gpa_1<-as.data.frame(lapply(gpa[0:4], normaliZe))
summary(gpa_1)
## obs gpa iq gender
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2188 1st Qu.:0.5619 1st Qu.:0.4844 1st Qu.:0.0000
## Median :0.4659 Median :0.7135 Median :0.5938 Median :1.0000
## Mean :0.4770 Mean :0.6761 Mean :0.5769 Mean :0.6026
## 3rd Qu.:0.7017 3rd Qu.:0.8263 3rd Qu.:0.7109 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
any(is.na(gpa_iq))
## [1] FALSE
gpa<- gpa_iq[-1]
gpa
## # A tibble: 78 × 5
## obs gpa iq gender concept
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 7.94 111 2 67
## 2 2 8.29 107 2 43
## 3 3 4.64 100 2 52
## 4 4 7.47 107 2 66
## 5 5 8.88 114 1 58
## 6 6 7.58 115 2 51
## 7 7 7.65 111 2 71
## 8 8 2.41 97 2 51
## 9 9 6 100 1 49
## 10 10 8.83 112 2 51
## # ℹ 68 more rows
View(gpa)
table(gpa$gpa)
##
## 0.53 1.76 2.412 3.408 3.647 3.82 3.936 4 4.643 4.7 4.885 5.062 5.237
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 5.528 5.936 6 6.057 6.173 6.231 6.419 6.938 7.167 7.295 7.333 7.47 7.571
## 1 1 1 2 1 1 1 1 2 2 1 2 1
## 7.585 7.588 7.598 7.643 7.647 7.65 7.822 7.825 7.833 7.882 7.94 7.996 7.998
## 1 1 1 1 2 1 1 1 1 1 1 1 1
## 8 8.167 8.175 8.235 8.292 8.333 8.353 8.714 8.833 8.882 8.938 8.998 9
## 2 2 2 1 1 1 1 1 1 1 1 1 1
## 9.167 9.333 9.348 9.41 9.429 9.5 9.571 9.585 9.648 9.763 9.999 10.14 10.58
## 3 1 1 1 1 2 1 1 1 1 1 1 1
## 10.7 10.76
## 1 1
gpa$gpa<-factor(gpa$gpa)
str(gpa$gpa)
## Factor w/ 67 levels "0.53","1.76",..: 37 44 9 25 49 27 32 3 16 48 ...
table(gpa$gpa)
##
## 0.53 1.76 2.412 3.408 3.647 3.82 3.936 4 4.643 4.7 4.885 5.062 5.237
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 5.528 5.936 6 6.057 6.173 6.231 6.419 6.938 7.167 7.295 7.333 7.47 7.571
## 1 1 1 2 1 1 1 1 2 2 1 2 1
## 7.585 7.588 7.598 7.643 7.647 7.65 7.822 7.825 7.833 7.882 7.94 7.996 7.998
## 1 1 1 1 2 1 1 1 1 1 1 1 1
## 8 8.167 8.175 8.235 8.292 8.333 8.353 8.714 8.833 8.882 8.938 8.998 9
## 2 2 2 1 1 1 1 1 1 1 1 1 1
## 9.167 9.333 9.348 9.41 9.429 9.5 9.571 9.585 9.648 9.763 9.999 10.14 10.58
## 3 1 1 1 1 2 1 1 1 1 1 1 1
## 10.7 10.76
## 1 1
dist_mat<-dist(gpa_1, method='euclidean')
hclust_avg<-hclust(dist_mat, method='average')
hclust_avg
##
## Call:
## hclust(d = dist_mat, method = "average")
##
## Cluster method : average
## Distance : euclidean
## Number of objects: 78
plot(hclust_avg)
cut_avg<-cutree(hclust_avg, k=2)
rect.hclust(hclust_avg, k=2, border=2.6)
abline(h=0.7,col='red')
library(dendextend)
##
## ---------------------
## Welcome to dendextend version 1.17.1
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
avg_dend_obj <- as.dendrogram(hclust_avg)
avg_col_dend <- color_branches(avg_dend_obj, h = 3)
plot(avg_col_dend)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
gpa_df_cl<-mutate(gpa,cluster=cut_avg)
count(gpa_df_cl,cluster)
## # A tibble: 2 × 2
## cluster n
## <int> <int>
## 1 1 47
## 2 2 31
library(ggplot2)
ggplot(gpa_df_cl,aes(x=iq, y = gpa,color = factor(cluster))) +geom_point()
library("clValid")
## Loading required package: cluster
library(cluster)
?dunn
## starting httpd help server ...
## done
dunn(dist_mat,cut_avg)
## [1] 0.8841589
dist_mat<-dist(gpa_1, method='euclidean')
hclust_avg<-hclust(dist_mat, method='average')
hclust_avg
##
## Call:
## hclust(d = dist_mat, method = "average")
##
## Cluster method : average
## Distance : euclidean
## Number of objects: 78
plot(hclust_avg)
cut_avg<-cutree(hclust_avg, k=3)
rect.hclust(hclust_avg, k=3, border=2.6)
abline(h=0.7,col='red')
library(dendextend)
avg_dend_obj <- as.dendrogram(hclust_avg)
avg_col_dend <- color_branches(avg_dend_obj, h = 3)
plot(avg_col_dend)
library(dplyr)
gpa_df_cl<-mutate(gpa,cluster=cut_avg)
count(gpa_df_cl,cluster)
## # A tibble: 3 × 2
## cluster n
## <int> <int>
## 1 1 37
## 2 2 10
## 3 3 31
library(ggplot2)
ggplot(gpa_df_cl,aes(x=iq, y = gpa,color = factor(cluster))) +geom_point()
library("clValid")
library(cluster)
?dunn
dunn(dist_mat,cut_avg)
## [1] 0.2465046
dist_mat<-dist(gpa_1, method='euclidean')
hclust_avg<-hclust(dist_mat, method='average')
hclust_avg
##
## Call:
## hclust(d = dist_mat, method = "average")
##
## Cluster method : average
## Distance : euclidean
## Number of objects: 78
plot(hclust_avg)
cut_avg<-cutree(hclust_avg, k=4)
rect.hclust(hclust_avg, k=4, border=2.6)
abline(h=0.7,col='red')
library(dendextend)
avg_dend_obj <- as.dendrogram(hclust_avg)
avg_col_dend <- color_branches(avg_dend_obj, h = 3)
plot(avg_col_dend)
library(dplyr)
gpa_df_cl<-mutate(gpa,cluster=cut_avg)
count(gpa_df_cl,cluster)
## # A tibble: 4 × 2
## cluster n
## <int> <int>
## 1 1 37
## 2 2 10
## 3 3 18
## 4 4 13
library(ggplot2)
ggplot(gpa_df_cl,aes(x=iq, y = gpa,color = factor(cluster))) +geom_point()
library("clValid")
library(cluster)
?dunn
dunn(dist_mat,cut_avg)
## [1] 0.2591645
kmeans(gpa_1, centers = 2, iter.max = 10, nstart = 1)
## K-means clustering with 2 clusters of sizes 47, 31
##
## Cluster means:
## obs gpa iq gender
## 1 0.4721954 0.6599842 0.6087101 1
## 2 0.4842375 0.7005424 0.5287298 0
##
## Clustering vector:
## [1] 1 1 1 1 2 1 1 1 2 1 2 2 1 2 2 2 2 2 1 2 1 1 2 1 2 1 1 1 1 2 2 2 1 1 2 1 1 2
## [39] 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 2 2 1 1 1 1 1 1 2 1 2 1 1 2 2 1 2 1 2 2 2 1
## [77] 2 1
##
## Within cluster sum of squares by cluster:
## [1] 7.686896 5.329421
## (between_SS / total_SS = 59.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
set.seed(78)
km.res <- kmeans(gpa_1, 2, nstart = 25)
print(km.res)
## K-means clustering with 2 clusters of sizes 47, 31
##
## Cluster means:
## obs gpa iq gender
## 1 0.4721954 0.6599842 0.6087101 1
## 2 0.4842375 0.7005424 0.5287298 0
##
## Clustering vector:
## [1] 1 1 1 1 2 1 1 1 2 1 2 2 1 2 2 2 2 2 1 2 1 1 2 1 2 1 1 1 1 2 2 2 1 1 2 1 1 2
## [39] 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 2 2 1 1 1 1 1 1 2 1 2 1 1 2 2 1 2 1 2 2 2 1
## [77] 2 1
##
## Within cluster sum of squares by cluster:
## [1] 7.686896 5.329421
## (between_SS / total_SS = 59.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
kmeans(gpa_1, centers = 3, iter.max = 10, nstart = 1)
## K-means clustering with 3 clusters of sizes 13, 18, 47
##
## Cluster means:
## obs gpa iq gender
## 1 0.8164336 0.6635161 0.4122596 0
## 2 0.2443182 0.7272836 0.6128472 0
## 3 0.4721954 0.6599842 0.6087101 1
##
## Clustering vector:
## [1] 3 3 3 3 2 3 3 3 2 3 2 2 3 2 2 2 2 2 3 2 3 3 2 3 2 3 3 3 3 2 2 2 3 3 2 3 3 2
## [39] 2 3 3 3 3 3 3 3 1 3 3 3 3 3 3 1 1 1 3 3 3 3 3 3 1 3 1 3 3 1 1 3 1 3 1 1 1 3
## [77] 1 3
##
## Within cluster sum of squares by cluster:
## [1] 1.148926 1.375380 7.686896
## (between_SS / total_SS = 67.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
set.seed(78)
km.res <- kmeans(gpa_1, 3, nstart = 25)
print(km.res)
## K-means clustering with 3 clusters of sizes 27, 20, 31
##
## Cluster means:
## obs gpa iq gender
## 1 0.2752525 0.7233047 0.6805556 1
## 2 0.7380682 0.5745015 0.5117188 1
## 3 0.4842375 0.7005424 0.5287298 0
##
## Clustering vector:
## [1] 1 1 1 1 3 1 1 1 3 1 3 3 1 3 3 3 3 3 1 3 1 1 3 1 3 1 1 1 1 3 3 3 1 1 3 1 1 3
## [39] 3 1 1 1 1 1 1 2 3 2 2 2 2 2 2 3 3 3 2 2 2 2 2 2 3 2 3 2 2 3 3 2 3 2 3 3 3 2
## [77] 3 2
##
## Within cluster sum of squares by cluster:
## [1] 2.734232 1.909746 5.329421
## (between_SS / total_SS = 68.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
kmeans(gpa_1, centers = 4, iter.max = 10, nstart = 1)
## K-means clustering with 4 clusters of sizes 11, 13, 18, 36
##
## Cluster means:
## obs gpa iq gender
## 1 0.4948347 0.3213365 0.3707386 1
## 2 0.8164336 0.6635161 0.4122596 0
## 3 0.2443182 0.7272836 0.6128472 0
## 4 0.4652778 0.7634599 0.6814236 1
##
## Clustering vector:
## [1] 4 4 1 4 3 4 4 1 3 4 3 3 4 3 3 3 3 3 1 3 4 1 3 4 3 4 4 4 4 3 3 3 4 4 3 4 4 3
## [39] 3 4 4 4 4 4 4 1 2 1 4 4 1 4 1 2 2 2 4 1 4 4 4 4 2 4 2 4 4 2 2 1 2 1 2 2 2 4
## [77] 2 4
##
## Within cluster sum of squares by cluster:
## [1] 1.545389 1.148926 1.375380 3.673906
## (between_SS / total_SS = 75.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
set.seed(78)
km.res <- kmeans(gpa_1, 4, nstart = 25)
print(km.res)
## K-means clustering with 4 clusters of sizes 27, 13, 20, 18
##
## Cluster means:
## obs gpa iq gender
## 1 0.2752525 0.7233047 0.6805556 1
## 2 0.8164336 0.6635161 0.4122596 0
## 3 0.7380682 0.5745015 0.5117188 1
## 4 0.2443182 0.7272836 0.6128472 0
##
## Clustering vector:
## [1] 1 1 1 1 4 1 1 1 4 1 4 4 1 4 4 4 4 4 1 4 1 1 4 1 4 1 1 1 1 4 4 4 1 1 4 1 1 4
## [39] 4 1 1 1 1 1 1 3 2 3 3 3 3 3 3 2 2 2 3 3 3 3 3 3 2 3 2 3 3 2 2 3 2 3 2 2 2 3
## [77] 2 3
##
## Within cluster sum of squares by cluster:
## [1] 2.734232 1.148926 1.909746 1.375380
## (between_SS / total_SS = 77.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
kmeans(gpa_1, centers = 5, iter.max = 10, nstart = 1)
## K-means clustering with 5 clusters of sizes 14, 10, 23, 18, 13
##
## Cluster means:
## obs gpa iq gender
## 1 0.7500000 0.6901480 0.6015625 1
## 2 0.4806818 0.2873412 0.3750000 1
## 3 0.2994071 0.8036423 0.7146739 1
## 4 0.2443182 0.7272836 0.6128472 0
## 5 0.8164336 0.6635161 0.4122596 0
##
## Clustering vector:
## [1] 3 3 2 3 4 3 3 2 4 3 4 4 3 4 4 4 4 4 2 4 3 2 4 3 4 3 3 3 3 4 4 4 3 3 4 3 3 4
## [39] 4 3 3 3 3 3 3 2 5 2 1 1 2 1 1 5 5 5 1 2 1 1 1 1 5 1 5 1 1 5 5 2 5 2 5 5 5 1
## [77] 5 1
##
## Within cluster sum of squares by cluster:
## [1] 0.4876544 1.3942328 1.3559469 1.3753804 1.1489260
## (between_SS / total_SS = 81.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
set.seed(78)
km.res <- kmeans(gpa_1, 5, nstart = 25)
print(km.res)
## K-means clustering with 5 clusters of sizes 18, 18, 17, 12, 13
##
## Cluster means:
## obs gpa iq gender
## 1 0.45770202 0.8327034 0.7760417 1
## 2 0.24431818 0.7272836 0.6128472 0
## 3 0.75133690 0.5460181 0.4678309 1
## 4 0.09848485 0.5623574 0.5572917 1
## 5 0.81643357 0.6635161 0.4122596 0
##
## Clustering vector:
## [1] 4 4 4 4 2 4 4 4 2 4 2 2 4 2 2 2 2 2 4 2 4 4 2 1 2 1 1 1 1 2 2 2 1 1 2 1 1 2
## [39] 2 1 1 1 1 1 1 3 5 3 3 1 3 3 3 5 5 5 1 3 1 3 3 3 5 3 5 3 3 5 5 3 5 3 5 5 5 3
## [77] 5 3
##
## Within cluster sum of squares by cluster:
## [1] 0.5985039 1.3753804 1.5690878 0.7697003 1.1489260
## (between_SS / total_SS = 82.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
kmeans(gpa_1, centers = 6, iter.max = 10, nstart = 1)
## K-means clustering with 6 clusters of sizes 8, 17, 13, 10, 12, 18
##
## Cluster means:
## obs gpa iq gender
## 1 0.16335227 0.5776760 0.4414062 0
## 2 0.75133690 0.5460181 0.4678309 1
## 3 0.81643357 0.6635161 0.4122596 0
## 4 0.30909091 0.8469697 0.7500000 0
## 5 0.09848485 0.5623574 0.5572917 1
## 6 0.45770202 0.8327034 0.7760417 1
##
## Clustering vector:
## [1] 5 5 5 5 4 5 5 5 1 5 1 1 5 1 1 4 4 1 5 1 5 5 1 6 4 6 6 6 6 4 4 4 6 6 4 6 6 4
## [39] 4 6 6 6 6 6 6 2 3 2 2 6 2 2 2 3 3 3 6 2 6 2 2 2 3 2 3 2 2 3 3 2 3 2 3 3 3 2
## [77] 3 2
##
## Within cluster sum of squares by cluster:
## [1] 0.1750368 1.5690878 1.1489260 0.3603926 0.7697003 0.5985039
## (between_SS / total_SS = 85.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
set.seed(78)
km.res <- kmeans(gpa_1, 6, nstart = 25)
print(km.res)
## K-means clustering with 6 clusters of sizes 6, 18, 15, 13, 14, 12
##
## Cluster means:
## obs gpa iq gender
## 1 0.71022727 0.3046595 0.3020833 1
## 2 0.24431818 0.7272836 0.6128472 0
## 3 0.41666667 0.8520626 0.7791667 1
## 4 0.81643357 0.6635161 0.4122596 0
## 5 0.75000000 0.6901480 0.6015625 1
## 6 0.09848485 0.5623574 0.5572917 1
##
## Clustering vector:
## [1] 6 6 6 6 2 6 6 6 2 6 2 2 6 2 2 2 2 2 6 2 6 6 2 3 2 3 3 3 3 2 2 2 3 3 2 3 3 2
## [39] 2 3 3 3 3 3 3 1 4 1 5 5 1 5 5 4 4 4 5 1 5 5 5 5 4 5 4 5 5 4 4 1 4 1 4 4 4 5
## [77] 4 5
##
## Within cluster sum of squares by cluster:
## [1] 0.4146329 1.3753804 0.4018837 1.1489260 0.4876544 0.7697003
## (between_SS / total_SS = 85.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
gpa_dist<- dist(gpa_1, method = "euclidean")
gpaCluster<- hclust(gpa_dist, method = "ward.D")
plot(gpaCluster)
clusterGroup <- cutree(gpaCluster, k = 4)
tapply(gpa_1$gpa, clusterGroup, mean)
## 1 2 3 4
## 0.7175367 0.7272836 0.5945837 0.6635161
plot(clusterGroup)
library(cluster)
s <- silhouette(clusterGroup, gpa_dist)
plot(s)
You can also embed plots, for example:
Note that the echo = FALSE
parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.