library(readr)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readxl)
library(datasets)
library(tidyr)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Load the raw patient table. readr guesses the column types; cells in the
# numeric height/weight columns that hold the text "기록없음" ("no record")
# cannot be parsed as numbers, so they are reported as parsing failures and
# stored as NA.
cancer <- read_csv(file = "cancer.csv")
## Parsed with column specification:
## cols(
##   age = col_double(),
##   sex = col_character(),
##   height = col_double(),
##   weight = col_double(),
##   dateOfoperation = col_character(),
##   cancerStaging = col_character(),
##   hospitalization = col_double(),
##   diseaseCode = col_character()
## )
## Warning: 278 parsing failures.
##  row    col expected   actual         file
## 1294 height a double 기록없음 'cancer.csv'
## 1294 weight a double 기록없음 'cancer.csv'
## 1365 height a double 기록없음 'cancer.csv'
## 1430 height a double 기록없음 'cancer.csv'
## 1459 height a double 기록없음 'cancer.csv'
## .... ...... ........ ........ ............
## See problems(...) for more details.
# View() opens a spreadsheet-style viewer window, which is only useful (and
# only reliably available) in an interactive session, so skip it when the
# script is rendered or run in batch mode.
if (interactive()) {
  View(cancer)
}
# Print the first six rows as a quick sanity check of the import.
head(cancer)
## # A tibble: 6 x 8
##     age sex   height weight dateOfoperation cancerStaging hospitalization
##   <dbl> <chr>  <dbl>  <dbl> <chr>           <chr>                   <dbl>
## 1    75 남      161    64   2011.6.22       I                          48
## 2    52 여      177.   75.3 2011.5.19       IV                         17
## 3    67 여      154    65.6 2011.5.31       III                        10
## 4    62 남      162    57   2011.6.21       I                          11
## 5    70 남      171    65   2011.6.9        II                         10
## 6    76 여      171    87   2011.6.16       III                        10
## # ... with 1 more variable: diseaseCode <chr>
# Show the column types plus the "problems" and "spec" attributes that
# read_csv() attached to the imported table.
str(cancer)
## Classes 'tbl_df', 'tbl' and 'data.frame':    18310 obs. of  8 variables:
##  $ age            : num  75 52 67 62 70 76 55 72 64 71 ...
##  $ sex            : chr  "남" "여" "여" "남" ...
##  $ height         : num  161 177 154 162 171 ...
##  $ weight         : num  64 75.3 65.6 57 65 87 77 55 67 55.5 ...
##  $ dateOfoperation: chr  "2011.6.22" "2011.5.19" "2011.5.31" "2011.6.21" ...
##  $ cancerStaging  : chr  "I" "IV" "III" "I" ...
##  $ hospitalization: num  48 17 10 11 10 10 12 18 15 35 ...
##  $ diseaseCode    : chr  "C187" "C187" "C187" "C187" ...
##  - attr(*, "problems")=Classes 'tbl_df', 'tbl' and 'data.frame': 278 obs. of  5 variables:
##   ..$ row     : int  1294 1294 1365 1430 1459 1468 1468 1521 1521 1565 ...
##   ..$ col     : chr  "height" "weight" "height" "height" ...
##   ..$ expected: chr  "a double" "a double" "a double" "a double" ...
##   ..$ actual  : chr  "기록없음" "기록없음" "기록없음" "기록없음" ...
##   ..$ file    : chr  "'cancer.csv'" "'cancer.csv'" "'cancer.csv'" "'cancer.csv'" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   age = col_double(),
##   ..   sex = col_character(),
##   ..   height = col_double(),
##   ..   weight = col_double(),
##   ..   dateOfoperation = col_character(),
##   ..   cancerStaging = col_character(),
##   ..   hospitalization = col_double(),
##   ..   diseaseCode = col_character()
##   .. )
# Number of rows and columns in the raw table.
dim(cancer)
## [1] 18310     8

Remove NA

# Keep only rows with no NA in any column. Note that a cancerStaging value of
# "기록없음" ("no record") is an ordinary string, not NA, so those rows are
# kept and show up as their own category in the staging table.
newd <- na.omit(cancer)

Basic statistics for the numeric variables. I checked summary statistics for the numeric variables. There were some outliers in each variable, and the mean age was about 63.5 years.

# Summary statistics (min, quartiles, median, mean, max) for each numeric
# column of the raw data. summary() excludes missing values on its own and
# reports how many there were under "NA's"; it has no na.rm argument, so the
# previously supplied na.rm value was silently ignored and is dropped here.
summary(cancer$height)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   100.0   155.0   162.0   161.6   168.0   196.6     194
summary(cancer$weight)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   26.00   54.00   61.00   61.48   68.40  118.00      84
summary(cancer$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   55.00   64.00   63.51   72.00  102.00
summary(cancer$hospitalization)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00   10.00   13.00   16.83   19.00  354.00
# Box plot of age to inspect its spread and outliers.
boxplot(cancer$age)

Check Standard Deviation

# Standard deviation of each numeric column of the NA-free table.
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
sd(newd$height, na.rm = TRUE)
## [1] 8.906801
sd(newd$weight, na.rm = TRUE)
## [1] 10.74521
sd(newd$age, na.rm = TRUE)
## [1] 11.58417
sd(newd$hospitalization, na.rm = TRUE)
## [1] 11.83359

Check the data by sex. Proportionally, males account for a somewhat larger share of the cancer patients in this data set (10,991 males versus 7,116 females).

# Per-sex age statistics: mean, total, median, and number of patients.
simplestat <- newd
summarise(
  group_by(simplestat, sex),
  mean_age = mean(age),
  sum_age = sum(age),
  median_age = median(age),
  n = n()
)
## # A tibble: 2 x 5
##   sex   mean_age sum_age median_age     n
##   <chr>    <dbl>   <dbl>      <dbl> <int>
## 1 남        63.2  694419         64 10991
## 2 여        64.0  455278         66  7116

Check the data by cancer staging. The bar plot shows the frequency of each cancer stage; stage III has the highest frequency.

# Frequency of each cancer stage; "기록없음" ("no record") is counted as its
# own category.
staging <- table(newd$cancerStaging)
staging
## 
##        I       II      III       IV 기록없음 
##     4041     5176     6515     2146      229
# Bar chart of the stage counts, with the title and x-axis label supplied
# directly to barplot().
barplot(staging, main = "Cancer Staging", xlab = "staging")

# Mean hospitalization for each cancer stage.
hospital <- tapply(newd$hospitalization, newd$cancerStaging, mean)
hospital
##        I       II      III       IV 기록없음 
## 14.71616 16.93083 16.57452 20.39795 20.73362
# Bar chart of the per-stage means on a fixed 0-50 y-axis.
barplot(hospital, ylim = c(0, 50))

# Distribution of hospitalization within each stage. Using the formula with
# `data = newd` labels the axes "cancerStaging" and "hospitalization" rather
# than "newd$cancerStaging" and "newd$hospitalization".
boxplot(hospitalization ~ cancerStaging, data = newd)

Find correlations between variables. First I simply checked the correlation between a few pairs of variables and found that height and weight are moderately correlated (r ≈ 0.61), age and hospitalization are only very weakly correlated (r ≈ 0.06), and days of hospitalization and height have essentially no relationship (r ≈ -0.02).

# Pairwise Pearson correlations between the numeric columns of newd.
# cor() is symmetric, so the second and fourth calls give the same value.
with(newd, cor(height, weight))
## [1] 0.6146694
with(newd, cor(age, hospitalization))
## [1] 0.06249857
with(newd, cor(hospitalization, height))
## [1] -0.01643714
with(newd, cor(hospitalization, age))
## [1] 0.06249857

Based on the results above, the following hypotheses were designed. #Correlation

Test 1. #H0: ρ = 0 There is no relationship between weight and height.
** #H1: ρ ≠ 0 There is a relationship between weight and height. ** Pearson’s correlation was used. The correlation between weight and height was about r = 0.61, which is positive and moderate. The p-value (< 2.2e-16) is far below the significance level (α = 0.05, matching the 95% confidence interval), so we can reject the null hypothesis, and the 95% confidence interval (0.606 to 0.624) does not include 0. Also, the t statistic (t = 104.85, df = 18105) is far larger than the critical value.

Test 2. #H0: ρ = 0 There is no relationship between age and hospitalization. ** #H1: ρ ≠ 0 There is a relationship between age and hospitalization.** Pearson’s correlation was used. The correlation between age and hospitalization was about r = 0.06, which is positive and very weak. The p-value (< 2.2e-16) is far below the significance level (α = 0.05, matching the 95% confidence interval), so we can reject the null hypothesis, and the 95% confidence interval (0.048 to 0.077) does not include 0. Also, the t statistic (t = 8.426, df = 18105) is larger than the critical value.

# Test 2: Pearson correlation test between hospitalization and age.
cor.test(newd$hospitalization, newd$age)
## 
##  Pearson's product-moment correlation
## 
## data:  newd$hospitalization and newd$age
## t = 8.426, df = 18105, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.04797658 0.07699413
## sample estimates:
##        cor 
## 0.06249857
# Correlation matrix of age, height and weight. The columns are selected by
# name instead of by position (1, 3, 4) so the call keeps working if the
# column order of the file ever changes.
cor(newd[, c("age", "height", "weight")])
##                age      height      weight
## age     1.00000000 -0.01701435 -0.02763348
## height -0.01701435  1.00000000  0.61466938
## weight -0.02763348  0.61466938  1.00000000
# Test 1: Pearson correlation test between height and weight.
cor.test(newd$height, newd$weight)
## 
##  Pearson's product-moment correlation
## 
## data:  newd$height and newd$weight
## t = 104.85, df = 18105, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6055250 0.6236515
## sample estimates:
##       cor 
## 0.6146694

Covariance and variance: age and weight, and age and height, have negative covariances, i.e. very slightly negative correlations.

# Variance of height. TRUE is spelled out because T can be reassigned.
var(newd$height, na.rm = TRUE)
## [1] 79.33111
# Covariance of age and weight. cov() states the intent directly; the
# two-argument form of var() computes the same quantity but reads like a
# variance.
cov(newd$age, newd$weight)
## [1] -3.439661
cov(newd$age, newd$weight, use = "complete.obs")
## [1] -3.439661
# Variance-covariance matrix of age, height and weight, with the columns
# selected by name rather than by position (1, 3, 4).
var(newd[, c("age", "height", "weight")], na.rm = TRUE)
##               age    height     weight
## age    134.193098 -1.755505  -3.439661
## height  -1.755505 79.331107  58.827217
## weight  -3.439661 58.827217 115.459567

Plot: weight and height have a moderate positive linear relationship. Plot: age and hospitalization have only a very weak positive linear relationship.

# Scatter plot of weight against height. The column names are written out
# explicitly and match the ones data.frame() would derive on its own.
DF2 <- data.frame(
  newd.height = newd$height,
  newd.weight = newd$weight
)
plot(DF2)

# Scatter plot of hospitalization against age.
DF5 <- data.frame(
  newd.age = newd$age,
  newd.hospitalization = newd$hospitalization
)
plot(DF5)

Describe the process that you followed in this project, as well as what you learned from these data.

I first installed the packages and looked through the data with the summary function. After figuring out the structure and some numeric and categorical information about the data, I removed the NAs. Then, by reviewing basic statistics of the numeric variables (mean, range, median, standard deviation), I decided what my hypotheses would be for the correlation analysis. With four numerically recorded variables that could be compared, there were several options. I used Pearson’s correlation, and the null hypothesis was rejected in every test, which means the correlations were statistically significant. What I learned from the data is listed below:
1. Males made up a proportionally larger share of the cancer patients than females.
2. Of the four cancer stages, stage III was the most frequent, although the differences between stages were not large.
3. There were extreme outliers in hospitalization. 4. Weight and height for people who have cancer have a moderate positive linear correlation. 5. Age and hospitalization for people who have cancer have a weak positive linear relationship.