This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
#### Lazy Learning -Classification using Nearest Neighbors
#The CSV file at "C:/Users/Home/Downloads/wisc_bc_data.csv" is read further below
#(with read_csv()) and its contents are stored in a data frame named wisc_bc_data.
# Set the CRAN mirror so that install.packages() works without prompting
options(repos = "https://cloud.r-project.org/")
# Install the required package only when it is missing. Re-installing it on
# every knit is unnecessary, and an earlier run of the unconditional install
# failed to replace class.dll on Windows ("Permission denied").
if (!requireNamespace("class", quietly = TRUE)) {
  install.packages("class")
}
library(class)
## Warning: package 'class' was built under R version 4.3.3
#read_csv() is a function from the readr package used to read CSV files.
library(readr)
wisc_bc_data <- read_csv("C:/Users/Home/Downloads/wisc_bc_data.csv")
## Rows: 569 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): diagnosis
## dbl (31): id, radius_mean, texture_mean, perimeter_mean, area_mean, smoothne...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View() opens an interactive data viewer, so it is only called in an
# interactive session; it serves no purpose (and can fail) while knitting.
if (interactive()) {
  View(wisc_bc_data)
}
# read_csv() keeps character columns such as diagnosis as character; it never
# converts them to factors, so no stringsAsFactors = FALSE argument is needed.
# (That argument belongs to base R's read.csv()/data.frame(), where it has
# defaulted to FALSE since R 4.0.)
# Give the dataset a shorter name
wd <- wisc_bc_data
# str() displays the structure of the data: each column's type and first values
str(wd)
## spc_tbl_ [569 × 32] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ id : num [1:569] 842302 842517 84300903 84348301 84358402 ...
## $ diagnosis : chr [1:569] "M" "M" "M" "M" ...
## $ radius_mean : num [1:569] 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num [1:569] 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num [1:569] 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num [1:569] 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num [1:569] 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num [1:569] 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num [1:569] 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave points_mean : num [1:569] 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num [1:569] 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num [1:569] 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num [1:569] 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num [1:569] 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num [1:569] 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num [1:569] 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num [1:569] 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num [1:569] 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num [1:569] 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave points_se : num [1:569] 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num [1:569] 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num [1:569] 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num [1:569] 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num [1:569] 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num [1:569] 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num [1:569] 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num [1:569] 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num [1:569] 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num [1:569] 0.712 0.242 0.45 0.687 0.4 ...
## $ concave points_worst : num [1:569] 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num [1:569] 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num [1:569] 0.1189 0.089 0.0876 0.173 0.0768 ...
## - attr(*, "spec")=
## .. cols(
## .. id = col_double(),
## .. diagnosis = col_character(),
## .. radius_mean = col_double(),
## .. texture_mean = col_double(),
## .. perimeter_mean = col_double(),
## .. area_mean = col_double(),
## .. smoothness_mean = col_double(),
## .. compactness_mean = col_double(),
## .. concavity_mean = col_double(),
## .. `concave points_mean` = col_double(),
## .. symmetry_mean = col_double(),
## .. fractal_dimension_mean = col_double(),
## .. radius_se = col_double(),
## .. texture_se = col_double(),
## .. perimeter_se = col_double(),
## .. area_se = col_double(),
## .. smoothness_se = col_double(),
## .. compactness_se = col_double(),
## .. concavity_se = col_double(),
## .. `concave points_se` = col_double(),
## .. symmetry_se = col_double(),
## .. fractal_dimension_se = col_double(),
## .. radius_worst = col_double(),
## .. texture_worst = col_double(),
## .. perimeter_worst = col_double(),
## .. area_worst = col_double(),
## .. smoothness_worst = col_double(),
## .. compactness_worst = col_double(),
## .. concavity_worst = col_double(),
## .. `concave points_worst` = col_double(),
## .. symmetry_worst = col_double(),
## .. fractal_dimension_worst = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Remove the first column (id) from the data frame wd and assign the resulting
# data frame to wd1.
wd1 <- wd[, -1]
# View the table. View() opens an interactive data viewer, so it is only
# called in an interactive session (it serves no purpose while knitting).
if (interactive()) {
  View(wd1)
}
# wd1$diagnosis extracts the column named "diagnosis" from the data frame wd1.
# The $ operator accesses a column of a data frame by name: write the column
# name after the $ symbol.
wd1$diagnosis
## [1] "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M"
## [19] "M" "B" "B" "B" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M"
## [37] "M" "B" "M" "M" "M" "M" "M" "M" "M" "M" "B" "M" "B" "B" "B" "B" "B" "M"
## [55] "M" "B" "M" "M" "B" "B" "B" "B" "M" "B" "M" "M" "B" "B" "B" "B" "M" "B"
## [73] "M" "M" "B" "M" "B" "M" "M" "B" "B" "B" "M" "M" "B" "M" "M" "M" "B" "B"
## [91] "B" "M" "B" "B" "M" "M" "B" "B" "B" "M" "M" "B" "B" "B" "B" "M" "B" "B"
## [109] "M" "B" "B" "B" "B" "B" "B" "B" "B" "M" "M" "M" "B" "M" "M" "B" "B" "B"
## [127] "M" "M" "B" "M" "B" "M" "M" "B" "M" "M" "B" "B" "M" "B" "B" "M" "B" "B"
## [145] "B" "B" "M" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "B" "B" "M"
## [163] "M" "B" "M" "B" "B" "M" "M" "B" "B" "M" "M" "B" "B" "B" "B" "M" "B" "B"
## [181] "M" "M" "M" "B" "M" "B" "M" "B" "B" "B" "M" "B" "B" "M" "M" "B" "M" "M"
## [199] "M" "M" "B" "M" "M" "M" "B" "M" "B" "M" "B" "B" "M" "B" "M" "M" "M" "M"
## [217] "B" "B" "M" "M" "B" "B" "B" "M" "B" "B" "B" "B" "B" "M" "M" "B" "B" "M"
## [235] "B" "B" "M" "M" "B" "M" "B" "B" "B" "B" "M" "B" "B" "B" "B" "B" "M" "B"
## [253] "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "B" "B" "B" "B"
## [271] "B" "B" "M" "B" "M" "B" "B" "M" "B" "B" "M" "B" "M" "M" "B" "B" "B" "B"
## [289] "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "M" "B" "M" "B" "B" "B"
## [307] "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "B" "M" "B" "M"
## [325] "B" "B" "B" "B" "M" "M" "M" "B" "B" "B" "B" "M" "B" "M" "B" "M" "B" "B"
## [343] "B" "M" "B" "B" "B" "B" "B" "B" "B" "M" "M" "M" "B" "B" "B" "B" "B" "B"
## [361] "B" "B" "B" "B" "B" "M" "M" "B" "M" "M" "M" "B" "M" "M" "B" "B" "B" "B"
## [379] "B" "M" "B" "B" "B" "B" "B" "M" "B" "B" "B" "M" "B" "B" "M" "M" "B" "B"
## [397] "B" "B" "B" "B" "M" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "B" "B" "B"
## [415] "M" "B" "B" "M" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B"
## [433] "M" "M" "B" "M" "B" "B" "B" "B" "B" "M" "B" "B" "M" "B" "M" "B" "B" "M"
## [451] "B" "M" "B" "B" "B" "B" "B" "B" "B" "B" "M" "M" "B" "B" "B" "B" "B" "B"
## [469] "M" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "B" "B" "B" "B"
## [487] "B" "M" "B" "M" "B" "B" "M" "B" "B" "B" "B" "B" "M" "M" "B" "M" "B" "M"
## [505] "B" "B" "B" "B" "B" "M" "B" "B" "M" "B" "M" "B" "M" "M" "B" "B" "B" "M"
## [523] "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "M" "M" "B" "B" "B"
## [541] "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B"
## [559] "B" "B" "B" "B" "M" "M" "M" "M" "M" "M" "B"
table(wd1$diagnosis)
##
## B M
## 357 212
prop.table(table(wd1$diagnosis))
##
## B M
## 0.6274165 0.3725835
summary(wd1[c("radius_mean", "area_mean", "smoothness_mean")])
## radius_mean area_mean smoothness_mean
## Min. : 6.981 Min. : 143.5 Min. :0.05263
## 1st Qu.:11.700 1st Qu.: 420.3 1st Qu.:0.08637
## Median :13.370 Median : 551.1 Median :0.09587
## Mean :14.127 Mean : 654.9 Mean :0.09636
## 3rd Qu.:15.780 3rd Qu.: 782.7 3rd Qu.:0.10530
## Max. :28.110 Max. :2501.0 Max. :0.16340
# Create a min-max normalization function.
# Rescales a numeric vector x linearly so that min(x) maps to 0 and max(x)
# maps to 1:  (x - min(x)) / (max(x) - min(x)).
# The previous version was missing a pair of parentheses and computed
# x - min(x) / (max(x) - min(x)), which only shifts x by a constant and leaves
# every column on its original scale.
# A constant vector (max(x) == min(x)) is returned as all zeros instead of NaN.
# NOTE: outputs recorded elsewhere in this document that depend on normalize()
# (the summary of wd_n, the kNN predictions and the CrossTable) were produced
# with the previous formula and must be regenerated by re-knitting.
normalize <- function(x) {
  lo <- min(x)
  span <- max(x) - lo
  # isTRUE() keeps NA input flowing through to an all-NA result, as before
  if (isTRUE(span == 0)) {
    return(rep(0, length(x)))
  }
  (x - lo) / span
}
normalize(c(1, 2, 3, 4, 5))
## [1] 0.00 0.25 0.50 0.75 1.00
# In the context of data normalization, "scaling" refers to the process of
# transforming the values of a variable to a specific range. This transformation
# is often performed to make the values more comparable or to ensure that they
# fall within a specific range that is suitable for analysis or modeling.
# Here, the function scales the values of the vector c(1, 10, 100, 1000) to the
# range 0 to 1, in proportion to their position within the original range.
normalize(c(1, 10, 100, 1000))
## [1] 0.000000000 0.009009009 0.099099099 1.000000000
#This code performs normalization on the columns of the data frame wd1 from columns 2
#to 31 and stores the normalized values in a new data frame wd_n
wd_n<-as.data.frame(lapply(wd1[2:31],normalize))
summary(wd_n[c("radius_mean","area_mean","smoothness_mean")])
## radius_mean area_mean smoothness_mean
## Min. : 6.651 Min. : 143.4 Min. :-0.4225
## 1st Qu.:11.370 1st Qu.: 420.2 1st Qu.:-0.3888
## Median :13.040 Median : 551.0 Median :-0.3793
## Mean :13.797 Mean : 654.8 Mean :-0.3788
## 3rd Qu.:15.450 3rd Qu.: 782.6 3rd Qu.:-0.3698
## Max. :27.780 Max. :2500.9 Max. :-0.3117
wd1$diagnosis <- factor(wd1$diagnosis, levels = c("B", "M"), labels = c("Benign", "Malignant"))
str(wd1$diagnosis)
## Factor w/ 2 levels "Benign","Malignant": 2 2 2 2 2 2 2 2 2 2 ...
# Split the data by row position: rows 1-469 are used for training and
# rows 470-569 (100 rows) for testing.
# Features come from the normalized data frame wd_n; labels come from the first
# column (diagnosis) of wd1. wd1 is a tibble, so [rows, 1] keeps a one-column
# tibble and the label vector is later extracted with $diagnosis.
wd1_train <- wd_n[seq_len(469), ]
wd1_test <- wd_n[seq(470, 569), ]
wd1_train_labels <- wd1[seq_len(469), 1]
wd1_test_labels <- wd1[seq(470, 569), 1]
head(wd1_test_labels)
## # A tibble: 6 × 1
## diagnosis
## <fct>
## 1 Benign
## 2 Benign
## 3 Benign
## 4 Benign
## 5 Benign
## 6 Benign
sqrt(469)
## [1] 21.65641
# The class package (which provides knn()) is already attached earlier in this
# document, so it is not installed again here: calling install.packages() on a
# package that is in use only produces the warning
# "package 'class' is in use and will not be installed".
# library() attaches the package so its functions are available in the R session.
library(class)
# Classify every row of the test set with k-nearest neighbours, using the
# training rows and their diagnosis labels.
# k = 21 is presumably chosen as an odd number close to sqrt(469) (about 21.66),
# the square root of the number of training rows.
wd1_test_pred <- knn(
  train = wd1_train,
  test = wd1_test,
  cl = wd1_train_labels$diagnosis,
  k = 21
)
# Print the predicted class of each test row
wd1_test_pred
## [1] Benign Benign Benign Malignant Benign Benign Benign
## [8] Benign Benign Benign Malignant Benign Malignant Benign
## [15] Benign Benign Benign Benign Malignant Benign Malignant
## [22] Benign Malignant Malignant Benign Benign Benign Benign
## [29] Benign Malignant Malignant Benign Malignant Benign Malignant
## [36] Benign Benign Benign Benign Malignant Malignant Benign
## [43] Benign Malignant Benign Malignant Benign Malignant Malignant
## [50] Benign Benign Benign Malignant Benign Benign Benign
## [57] Benign Benign Benign Benign Benign Benign Benign
## [64] Benign Malignant Benign Malignant Benign Benign Benign
## [71] Benign Benign Benign Benign Benign Benign Benign
## [78] Benign Benign Benign Benign Benign Benign Benign
## [85] Benign Benign Benign Benign Benign Benign Benign
## [92] Benign Benign Malignant Malignant Malignant Malignant Malignant
## [99] Malignant Benign
## Levels: Benign Malignant
# Install gmodels only when it is not already available, so that the package
# is not downloaded and reinstalled on every knit
if (!requireNamespace("gmodels", quietly = TRUE)) {
  install.packages("gmodels")
}
## Installing package into 'C:/Users/Home/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'gmodels' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Home\AppData\Local\Temp\Rtmpg3TKqj\downloaded_packages
library(gmodels)
## Warning: package 'gmodels' was built under R version 4.3.3
# Cross-tabulate the true test labels (rows) against the kNN predictions
# (columns); prop.chisq = FALSE leaves the chi-square contribution out of
# each cell.
CrossTable(
  x = wd1_test_labels$diagnosis,
  y = wd1_test_pred,
  prop.chisq = FALSE
)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 100
##
##
## | wd1_test_pred
## wd1_test_labels$diagnosis | Benign | Malignant | Row Total |
## --------------------------|-----------|-----------|-----------|
## Benign | 73 | 4 | 77 |
## | 0.948 | 0.052 | 0.770 |
## | 0.986 | 0.154 | |
## | 0.730 | 0.040 | |
## --------------------------|-----------|-----------|-----------|
## Malignant | 1 | 22 | 23 |
## | 0.043 | 0.957 | 0.230 |
## | 0.014 | 0.846 | |
## | 0.010 | 0.220 | |
## --------------------------|-----------|-----------|-----------|
## Column Total | 74 | 26 | 100 |
## | 0.740 | 0.260 | |
## --------------------------|-----------|-----------|-----------|
##
##