Untitled

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

#### Lazy Learning - Classification using Nearest Neighbors
#The read_csv() call further below reads the CSV file located at the specified file path
#("C:/Users/Home/Downloads/wisc_bc_data.csv") and stores its contents in a data frame named wisc_bc_data.
# Set the CRAN mirror
options(repos = "https://cloud.r-project.org/")

# Install the required package only when it is not already available.
# Reinstalling a package that is already installed and in use is what triggers
# the "cannot remove prior installation" / "Permission denied" warnings.
if (!requireNamespace("class", quietly = TRUE)) {
  install.packages("class")
}

## Installing package into 'C:/Users/Home/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)

## package 'class' successfully unpacked and MD5 sums checked

## Warning: cannot remove prior installation of package 'class'

## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\Home\AppData\Local\R\win-library\4.3\00LOCK\class\libs\x64\class.dll
## to C:\Users\Home\AppData\Local\R\win-library\4.3\class\libs\x64\class.dll:
## Permission denied

## Warning: restored 'class'

## 
## The downloaded binary packages are in
##  C:\Users\Home\AppData\Local\Temp\Rtmpg3TKqj\downloaded_packages

library(class)

## Warning: package 'class' was built under R version 4.3.3

# Attach readr; its read_csv() function reads a CSV file into a tibble.
library(readr)
# Load the data set from its location on disk.
wisc_bc_data <- read_csv(file = "C:/Users/Home/Downloads/wisc_bc_data.csv")

## Rows: 569 Columns: 32

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): diagnosis
## dbl (31): id, radius_mean, texture_mean, perimeter_mean, area_mean, smoothne...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# View() is an interactive inspection tool, so open the data viewer only when
# the code is run in an interactive session rather than while knitting.
if (interactive()) {
  View(wisc_bc_data)
}

#stringsAsFactors=FALSE specifies that character columns should not be converted to factors. Before R 4.0,
#base R's read.csv() converted character columns to factors by default. read_csv() from readr, used above,
#never does this (diagnosis is read as chr), so the argument is not needed here.

# Work with the data set under a shorter name.
wd <- wisc_bc_data

# str() stands for "structure": it lists every column of wd together with its
# type and first few values.
str(object = wd)

## spc_tbl_ [569 × 32] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ id                     : num [1:569] 842302 842517 84300903 84348301 84358402 ...
##  $ diagnosis              : chr [1:569] "M" "M" "M" "M" ...
##  $ radius_mean            : num [1:569] 18 20.6 19.7 11.4 20.3 ...
##  $ texture_mean           : num [1:569] 10.4 17.8 21.2 20.4 14.3 ...
##  $ perimeter_mean         : num [1:569] 122.8 132.9 130 77.6 135.1 ...
##  $ area_mean              : num [1:569] 1001 1326 1203 386 1297 ...
##  $ smoothness_mean        : num [1:569] 0.1184 0.0847 0.1096 0.1425 0.1003 ...
##  $ compactness_mean       : num [1:569] 0.2776 0.0786 0.1599 0.2839 0.1328 ...
##  $ concavity_mean         : num [1:569] 0.3001 0.0869 0.1974 0.2414 0.198 ...
##  $ concave points_mean    : num [1:569] 0.1471 0.0702 0.1279 0.1052 0.1043 ...
##  $ symmetry_mean          : num [1:569] 0.242 0.181 0.207 0.26 0.181 ...
##  $ fractal_dimension_mean : num [1:569] 0.0787 0.0567 0.06 0.0974 0.0588 ...
##  $ radius_se              : num [1:569] 1.095 0.543 0.746 0.496 0.757 ...
##  $ texture_se             : num [1:569] 0.905 0.734 0.787 1.156 0.781 ...
##  $ perimeter_se           : num [1:569] 8.59 3.4 4.58 3.44 5.44 ...
##  $ area_se                : num [1:569] 153.4 74.1 94 27.2 94.4 ...
##  $ smoothness_se          : num [1:569] 0.0064 0.00522 0.00615 0.00911 0.01149 ...
##  $ compactness_se         : num [1:569] 0.049 0.0131 0.0401 0.0746 0.0246 ...
##  $ concavity_se           : num [1:569] 0.0537 0.0186 0.0383 0.0566 0.0569 ...
##  $ concave points_se      : num [1:569] 0.0159 0.0134 0.0206 0.0187 0.0188 ...
##  $ symmetry_se            : num [1:569] 0.03 0.0139 0.0225 0.0596 0.0176 ...
##  $ fractal_dimension_se   : num [1:569] 0.00619 0.00353 0.00457 0.00921 0.00511 ...
##  $ radius_worst           : num [1:569] 25.4 25 23.6 14.9 22.5 ...
##  $ texture_worst          : num [1:569] 17.3 23.4 25.5 26.5 16.7 ...
##  $ perimeter_worst        : num [1:569] 184.6 158.8 152.5 98.9 152.2 ...
##  $ area_worst             : num [1:569] 2019 1956 1709 568 1575 ...
##  $ smoothness_worst       : num [1:569] 0.162 0.124 0.144 0.21 0.137 ...
##  $ compactness_worst      : num [1:569] 0.666 0.187 0.424 0.866 0.205 ...
##  $ concavity_worst        : num [1:569] 0.712 0.242 0.45 0.687 0.4 ...
##  $ concave points_worst   : num [1:569] 0.265 0.186 0.243 0.258 0.163 ...
##  $ symmetry_worst         : num [1:569] 0.46 0.275 0.361 0.664 0.236 ...
##  $ fractal_dimension_worst: num [1:569] 0.1189 0.089 0.0876 0.173 0.0768 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   id = col_double(),
##   ..   diagnosis = col_character(),
##   ..   radius_mean = col_double(),
##   ..   texture_mean = col_double(),
##   ..   perimeter_mean = col_double(),
##   ..   area_mean = col_double(),
##   ..   smoothness_mean = col_double(),
##   ..   compactness_mean = col_double(),
##   ..   concavity_mean = col_double(),
##   ..   `concave points_mean` = col_double(),
##   ..   symmetry_mean = col_double(),
##   ..   fractal_dimension_mean = col_double(),
##   ..   radius_se = col_double(),
##   ..   texture_se = col_double(),
##   ..   perimeter_se = col_double(),
##   ..   area_se = col_double(),
##   ..   smoothness_se = col_double(),
##   ..   compactness_se = col_double(),
##   ..   concavity_se = col_double(),
##   ..   `concave points_se` = col_double(),
##   ..   symmetry_se = col_double(),
##   ..   fractal_dimension_se = col_double(),
##   ..   radius_worst = col_double(),
##   ..   texture_worst = col_double(),
##   ..   perimeter_worst = col_double(),
##   ..   area_worst = col_double(),
##   ..   smoothness_worst = col_double(),
##   ..   compactness_worst = col_double(),
##   ..   concavity_worst = col_double(),
##   ..   `concave points_worst` = col_double(),
##   ..   symmetry_worst = col_double(),
##   ..   fractal_dimension_worst = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

# Drop the first column (id) from wd and keep the remaining columns as wd1.
wd1 <- wd[-1]




# Open wd1 in the data viewer, but only in an interactive session; View() is
# an interactive inspection tool and is skipped while knitting.
if (interactive()) {
  View(wd1)
}

# The $ operator accesses a column of a data frame by the name written after
# it; here it extracts and prints the "diagnosis" column of wd1.
wd1$diagnosis

##   [1] "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M"
##  [19] "M" "B" "B" "B" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M"
##  [37] "M" "B" "M" "M" "M" "M" "M" "M" "M" "M" "B" "M" "B" "B" "B" "B" "B" "M"
##  [55] "M" "B" "M" "M" "B" "B" "B" "B" "M" "B" "M" "M" "B" "B" "B" "B" "M" "B"
##  [73] "M" "M" "B" "M" "B" "M" "M" "B" "B" "B" "M" "M" "B" "M" "M" "M" "B" "B"
##  [91] "B" "M" "B" "B" "M" "M" "B" "B" "B" "M" "M" "B" "B" "B" "B" "M" "B" "B"
## [109] "M" "B" "B" "B" "B" "B" "B" "B" "B" "M" "M" "M" "B" "M" "M" "B" "B" "B"
## [127] "M" "M" "B" "M" "B" "M" "M" "B" "M" "M" "B" "B" "M" "B" "B" "M" "B" "B"
## [145] "B" "B" "M" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "B" "B" "M"
## [163] "M" "B" "M" "B" "B" "M" "M" "B" "B" "M" "M" "B" "B" "B" "B" "M" "B" "B"
## [181] "M" "M" "M" "B" "M" "B" "M" "B" "B" "B" "M" "B" "B" "M" "M" "B" "M" "M"
## [199] "M" "M" "B" "M" "M" "M" "B" "M" "B" "M" "B" "B" "M" "B" "M" "M" "M" "M"
## [217] "B" "B" "M" "M" "B" "B" "B" "M" "B" "B" "B" "B" "B" "M" "M" "B" "B" "M"
## [235] "B" "B" "M" "M" "B" "M" "B" "B" "B" "B" "M" "B" "B" "B" "B" "B" "M" "B"
## [253] "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "B" "B" "B" "B"
## [271] "B" "B" "M" "B" "M" "B" "B" "M" "B" "B" "M" "B" "M" "M" "B" "B" "B" "B"
## [289] "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "M" "B" "M" "B" "B" "B"
## [307] "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "B" "M" "B" "M"
## [325] "B" "B" "B" "B" "M" "M" "M" "B" "B" "B" "B" "M" "B" "M" "B" "M" "B" "B"
## [343] "B" "M" "B" "B" "B" "B" "B" "B" "B" "M" "M" "M" "B" "B" "B" "B" "B" "B"
## [361] "B" "B" "B" "B" "B" "M" "M" "B" "M" "M" "M" "B" "M" "M" "B" "B" "B" "B"
## [379] "B" "M" "B" "B" "B" "B" "B" "M" "B" "B" "B" "M" "B" "B" "M" "M" "B" "B"
## [397] "B" "B" "B" "B" "M" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "B" "B" "B"
## [415] "M" "B" "B" "M" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B"
## [433] "M" "M" "B" "M" "B" "B" "B" "B" "B" "M" "B" "B" "M" "B" "M" "B" "B" "M"
## [451] "B" "M" "B" "B" "B" "B" "B" "B" "B" "B" "M" "M" "B" "B" "B" "B" "B" "B"
## [469] "M" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "B" "B" "B" "B"
## [487] "B" "M" "B" "M" "B" "B" "M" "B" "B" "B" "B" "B" "M" "M" "B" "M" "B" "M"
## [505] "B" "B" "B" "B" "B" "M" "B" "B" "M" "B" "M" "B" "M" "M" "B" "B" "B" "M"
## [523] "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "M" "M" "B" "B" "B"
## [541] "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B"
## [559] "B" "B" "B" "B" "M" "M" "M" "M" "M" "M" "B"

# Count how many records carry each diagnosis code.
table(wd1[["diagnosis"]])

## 
##   B   M 
## 357 212

# Express the same counts as a proportion of all records.
proportions(table(wd1[["diagnosis"]]))

## 
##         B         M 
## 0.6274165 0.3725835

# Quartiles, mean and extremes for three of the numeric features.
summary(wd1[, c("radius_mean", "area_mean", "smoothness_mean")])

##   radius_mean       area_mean      smoothness_mean  
##  Min.   : 6.981   Min.   : 143.5   Min.   :0.05263  
##  1st Qu.:11.700   1st Qu.: 420.3   1st Qu.:0.08637  
##  Median :13.370   Median : 551.1   Median :0.09587  
##  Mean   :14.127   Mean   : 654.9   Mean   :0.09636  
##  3rd Qu.:15.780   3rd Qu.: 782.7   3rd Qu.:0.10530  
##  Max.   :28.110   Max.   :2501.0   Max.   :0.16340

# Create the min-max normalization function.
#
# normalize(x) rescales a numeric vector x linearly so that its smallest value
# maps to 0 and its largest value maps to 1:
#   (x - min(x)) / (max(x) - min(x))
# The subtraction has to be parenthesized before the division. The earlier
# version computed x - min(x) / (max(x) - min(x)), which only shifts x by a
# constant and leaves its scale unchanged.
# A vector whose values are all equal has a zero range and yields NaN.
# NOTE: printed results elsewhere in this document that depend on normalized
# values were produced with the earlier formula; re-knit to refresh them.
normalize <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}

normalize(c(1, 2, 3, 4, 5))

## [1] 0.00 0.25 0.50 0.75 1.00

#In the context of data normalization, "scaling" refers to the process of
#transforming the values of a variable to a specific range. This transformation
#is often performed to make the values more comparable or to ensure that they fall
#within a specific range that is suitable for analysis or modeling.


#The function scales the values of the vector c(1, 10, 100, 1000) between 0 and 1,
#proportionally to their position within the original range.
normalize(c(1, 10, 100, 1000))

## [1] 0.000000000 0.009009009 0.099099099 1.000000000

# Apply normalize() to each of columns 2 through 31 of wd1 and collect the
# results in a new data frame, wd_n.
wd_n <- as.data.frame(
  lapply(X = wd1[, 2:31], FUN = normalize)
)

# Summarise three of the columns of wd_n.
summary(wd_n[, c("radius_mean", "area_mean", "smoothness_mean")])

##   radius_mean       area_mean      smoothness_mean  
##  Min.   : 6.651   Min.   : 143.4   Min.   :-0.4225  
##  1st Qu.:11.370   1st Qu.: 420.2   1st Qu.:-0.3888  
##  Median :13.040   Median : 551.0   Median :-0.3793  
##  Mean   :13.797   Mean   : 654.8   Mean   :-0.3788  
##  3rd Qu.:15.450   3rd Qu.: 782.6   3rd Qu.:-0.3698  
##  Max.   :27.780   Max.   :2500.9   Max.   :-0.3117

# Recode diagnosis as a factor: "B" is labelled "Benign" and "M" is labelled
# "Malignant".
wd1[["diagnosis"]] <- factor(
  wd1[["diagnosis"]],
  levels = c("B", "M"),
  labels = c("Benign", "Malignant")
)
# Confirm the new type and its levels.
str(wd1[["diagnosis"]])

##  Factor w/ 2 levels "Benign","Malignant": 2 2 2 2 2 2 2 2 2 2 ...

# Split the rows of wd_n by position: rows 1-469 form the training set and
# rows 470-569 form the test set.
wd1_train <- wd_n[seq_len(469), ]
wd1_test <- wd_n[seq(470, 569), ]

# Take the diagnosis column of wd1 for the same rows. Selecting a single
# column of a tibble this way keeps it as a one-column tibble, not a vector.
wd1_train_labels <- wd1[seq_len(469), "diagnosis"]
wd1_test_labels <- wd1[seq(470, 569), "diagnosis"]
# Show the first few test labels.
head(wd1_test_labels)

## # A tibble: 6 × 1
##   diagnosis
##   <fct>    
## 1 Benign   
## 2 Benign   
## 3 Benign   
## 4 Benign   
## 5 Benign   
## 6 Benign

# Square root of the number of training rows, taken from the training set
# itself so the figure stays correct if the split changes.
# NOTE(review): presumably the basis for k = 21 in the knn() call - confirm.
sqrt(nrow(wd1_train))

## [1] 21.65641

# The class package provides knn(). Install it only when it is missing:
# install.packages() on a package that is already loaded is refused with
# "package 'class' is in use and will not be installed".
if (!requireNamespace("class", quietly = TRUE)) {
  install.packages("class", repos = "https://cloud.r-project.org/")
}

# Attach the package to bring its functions into the R environment.
library(class)

# Classify every test row with k-nearest neighbours (k = 21), using the
# training rows and their diagnosis labels as the reference set.
wd1_test_pred <- knn(
  train = wd1_train,
  test = wd1_test,
  cl = wd1_train_labels[["diagnosis"]],
  k = 21
)
# Print the predicted class for each test row.
wd1_test_pred

##   [1] Benign    Benign    Benign    Malignant Benign    Benign    Benign   
##   [8] Benign    Benign    Benign    Malignant Benign    Malignant Benign   
##  [15] Benign    Benign    Benign    Benign    Malignant Benign    Malignant
##  [22] Benign    Malignant Malignant Benign    Benign    Benign    Benign   
##  [29] Benign    Malignant Malignant Benign    Malignant Benign    Malignant
##  [36] Benign    Benign    Benign    Benign    Malignant Malignant Benign   
##  [43] Benign    Malignant Benign    Malignant Benign    Malignant Malignant
##  [50] Benign    Benign    Benign    Malignant Benign    Benign    Benign   
##  [57] Benign    Benign    Benign    Benign    Benign    Benign    Benign   
##  [64] Benign    Malignant Benign    Malignant Benign    Benign    Benign   
##  [71] Benign    Benign    Benign    Benign    Benign    Benign    Benign   
##  [78] Benign    Benign    Benign    Benign    Benign    Benign    Benign   
##  [85] Benign    Benign    Benign    Benign    Benign    Benign    Benign   
##  [92] Benign    Benign    Malignant Malignant Malignant Malignant Malignant
##  [99] Malignant Benign   
## Levels: Benign Malignant

# Install gmodels only when it is not already available, so that running the
# document does not download and reinstall it every time.
if (!requireNamespace("gmodels", quietly = TRUE)) {
  install.packages("gmodels")
}

## Installing package into 'C:/Users/Home/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)

## package 'gmodels' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Home\AppData\Local\Temp\Rtmpg3TKqj\downloaded_packages

library(gmodels)

## Warning: package 'gmodels' was built under R version 4.3.3

# Cross-tabulate the actual test labels (rows) against the kNN predictions
# (columns). prop.chisq=FALSE leaves the chi-square contribution out of each
# cell. The x and y expressions are kept exactly as written because their text
# is what appears as the row and column headings of the printed table.
CrossTable(x=wd1_test_labels$diagnosis, y=wd1_test_pred, prop.chisq=FALSE)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  100 
## 
##  
##                           | wd1_test_pred 
## wd1_test_labels$diagnosis |    Benign | Malignant | Row Total | 
## --------------------------|-----------|-----------|-----------|
##                    Benign |        73 |         4 |        77 | 
##                           |     0.948 |     0.052 |     0.770 | 
##                           |     0.986 |     0.154 |           | 
##                           |     0.730 |     0.040 |           | 
## --------------------------|-----------|-----------|-----------|
##                 Malignant |         1 |        22 |        23 | 
##                           |     0.043 |     0.957 |     0.230 | 
##                           |     0.014 |     0.846 |           | 
##                           |     0.010 |     0.220 |           | 
## --------------------------|-----------|-----------|-----------|
##              Column Total |        74 |        26 |       100 | 
##                           |     0.740 |     0.260 |           | 
## --------------------------|-----------|-----------|-----------|
## 
##

Untitled

Nurudeen Hussain

2024-05-02

R Markdown

Including Plots