This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.
library(tidyverse)
## -- Attaching packages ---------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.0.0 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.6
## v tidyr 0.8.1 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.3.0
## -- Conflicts ------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
read_csv("https://www.iun.edu/~cisjw/ds/files/data/weather.csv",
col_types = cols(
outlook = col_character(),
temperature = col_integer(),
humidity = col_integer(),
windy = col_character(),
play = col_character()
)
)
## # A tibble: 14 x 5
## outlook temperature humidity windy play
## <chr> <int> <int> <chr> <chr>
## 1 sunny 85 85 FALSE no
## 2 sunny 80 90 TRUE no
## 3 overcast 83 86 FALSE yes
## 4 rainy 70 96 FALSE yes
## 5 rainy 68 80 FALSE yes
## 6 rainy 65 70 TRUE no
## 7 overcast 64 65 TRUE yes
## 8 sunny 72 95 FALSE no
## 9 sunny 69 70 FALSE yes
## 10 rainy 75 80 FALSE yes
## 11 sunny 75 70 TRUE yes
## 12 overcast 72 90 TRUE yes
## 13 overcast 81 75 FALSE yes
## 14 rainy 71 91 TRUE no
# Load the Titanic training set with an explicit column specification.
# Age is read as double: the file contains fractional ages (e.g. .9167, .5)
# that col_integer() rejects and replaces with NA.
# The last column is named LifeBoat (not LiftBoat); it is declared as
# character, which is the type readr guesses for it when left unspecified.
train <- read_csv(
  "https://www.iun.edu/~cisjw/ds/files/data/train.csv",
  col_types = cols(
    Survived = col_character(),
    Pclass = col_character(),
    Name = col_character(),
    Sex = col_character(),
    Age = col_double(),
    SibSp = col_integer(),
    ParentChild = col_integer(),
    TicketNumber = col_character(),
    Fare = col_double(),
    Cabin = col_character(),
    Port = col_character(),
    LifeBoat = col_character()
  )
)
## Warning: The following named parsers don't match the column names: LiftBoat
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 42 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 2 Age no trailing char~ .9167 'https://www.iun.edu/~cisjw/ds/fil~ file 2 169 Age no trailing char~ .5 'https://www.iun.edu/~cisjw/ds/fil~ row 3 217 Age no trailing char~ .5 'https://www.iun.edu/~cisjw/ds/fil~ col 4 219 Age no trailing char~ .5 'https://www.iun.edu/~cisjw/ds/fil~ expected 5 346 Age no trailing char~ .8333 'https://www.iun.edu/~cisjw/ds/fil~

## See problems(...) for more details.
train
## # A tibble: 1,244 x 12
## Survived Pclass Name Sex Age SibSp ParentChild TicketNumber Fare
## <chr> <chr> <chr> <chr> <int> <int> <int> <chr> <dbl>
## 1 Yes First Alle~ Fema~ 29 0 0 24160 211.
## 2 Yes First Alli~ Male NA 1 2 113781 152.
## 3 No First Alli~ Fema~ 2 1 2 113781 152.
## 4 No First Alli~ Male 30 1 2 113781 152.
## 5 No First Alli~ Fema~ 25 1 2 113781 152.
## 6 Yes First Ande~ Male 48 0 0 19952 26.6
## 7 Yes First Andr~ Fema~ 63 1 0 13502 78.0
## 8 No First Andr~ Male 39 0 0 112050 0
## 9 Yes First Appl~ Fema~ 53 2 0 11769 51.5
## 10 No First Arta~ Male 71 0 0 PC 17609 49.5
## # ... with 1,234 more rows, and 3 more variables: Cabin <chr>, Port <chr>,
## # LifeBoat <chr>
# Load the Titanic test set with an explicit column specification.
# Age is read as double: the file contains fractional ages (e.g. .5, .1667)
# that col_integer() rejects and replaces with NA.
# The last column is named LifeBoat (not LiftBoat); it is declared as
# character, which is the type readr guesses for it when left unspecified.
test <- read_csv(
  "https://www.iun.edu/~cisjw/ds/files/data/test.csv",
  col_types = cols(
    Survived = col_character(),
    Pclass = col_character(),
    Name = col_character(),
    Sex = col_character(),
    Age = col_double(),
    SibSp = col_integer(),
    ParentChild = col_integer(),
    TicketNumber = col_character(),
    Fare = col_double(),
    Cabin = col_character(),
    Port = col_character(),
    LifeBoat = col_character()
  )
)
## Warning: The following named parsers don't match the column names: LiftBoat
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 3 parsing failures.
## row # A tibble: 3 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 24 Age no trailing chara~ .5 'https://www.iun.edu/~cisjw/ds/fi~ file 2 37 Age no trailing chara~ .1667 'https://www.iun.edu/~cisjw/ds/fi~ row 3 41 Age no trailing chara~ .5 'https://www.iun.edu/~cisjw/ds/fi~
test
## # A tibble: 65 x 12
## Survived Pclass Name Sex Age SibSp ParentChild TicketNumber Fare
## <chr> <chr> <chr> <chr> <int> <int> <int> <chr> <dbl>
## 1 No First Beat~ Male 36 0 0 13050 75.2
## 2 Yes First Cham~ Male 27 1 0 113806 53.1
## 3 Yes First Chau~ Fema~ 36 0 0 PC 17608 262.
## 4 Yes First Flem~ Fema~ NA 0 0 17421 111.
## 5 Yes First Fort~ Fema~ 24 3 2 19950 263
## 6 Yes First Mayn~ Fema~ 24 0 0 PC 17482 49.5
## 7 No First Ring~ Male 22 0 0 PC 17760 136.
## 8 Yes First Roth~ Fema~ 54 1 0 PC 17603 59.4
## 9 Yes First Snyd~ Fema~ 23 1 0 21228 82.3
## 10 Yes First Ston~ Fema~ 62 0 0 113572 80
## # ... with 55 more rows, and 3 more variables: Cabin <chr>, Port <chr>,
## # LifeBoat <chr>
# Preview the first five rows of the training set.
train %>%
  slice(seq_len(5))
## # A tibble: 5 x 12
## Survived Pclass Name Sex Age SibSp ParentChild TicketNumber Fare
## <chr> <chr> <chr> <chr> <int> <int> <int> <chr> <dbl>
## 1 Yes First Alle~ Fema~ 29 0 0 24160 211.
## 2 Yes First Alli~ Male NA 1 2 113781 152.
## 3 No First Alli~ Fema~ 2 1 2 113781 152.
## 4 No First Alli~ Male 30 1 2 113781 152.
## 5 No First Alli~ Fema~ 25 1 2 113781 152.
## # ... with 3 more variables: Cabin <chr>, Port <chr>, LifeBoat <chr>
# Preview the first five rows of the test set.
test %>%
  slice(seq_len(5))
## # A tibble: 5 x 12
## Survived Pclass Name Sex Age SibSp ParentChild TicketNumber Fare
## <chr> <chr> <chr> <chr> <int> <int> <int> <chr> <dbl>
## 1 No First Beat~ Male 36 0 0 13050 75.2
## 2 Yes First Cham~ Male 27 1 0 113806 53.1
## 3 Yes First Chau~ Fema~ 36 0 0 PC 17608 262.
## 4 Yes First Flem~ Fema~ NA 0 0 17421 111.
## 5 Yes First Fort~ Fema~ 24 3 2 19950 263
## # ... with 3 more variables: Cabin <chr>, Port <chr>, LifeBoat <chr>
train %>%
filter(!is.na(Age)) %>%
summarise(mean(Age))
## # A tibble: 1 x 1
## `mean(Age)`
## <dbl>
## 1 30.3
test %>%
filter(!is.na(Age)) %>%
summarise(mean(Age))
## # A tibble: 1 x 1
## `mean(Age)`
## <dbl>
## 1 26.2
train %>%
filter(Sex == "Female")
## # A tibble: 446 x 12
## Survived Pclass Name Sex Age SibSp ParentChild TicketNumber Fare
## <chr> <chr> <chr> <chr> <int> <int> <int> <chr> <dbl>
## 1 Yes First Alle~ Fema~ 29 0 0 24160 211.
## 2 No First Alli~ Fema~ 2 1 2 113781 152.
## 3 No First Alli~ Fema~ 25 1 2 113781 152.
## 4 Yes First Andr~ Fema~ 63 1 0 13502 78.0
## 5 Yes First Appl~ Fema~ 53 2 0 11769 51.5
## 6 Yes First Asto~ Fema~ 18 1 0 PC 17757 228.
## 7 Yes First Auba~ Fema~ 24 0 0 PC 17477 69.3
## 8 Yes First Barb~ Fema~ 26 0 0 19877 78.8
## 9 Yes First Baxt~ Fema~ 50 0 1 PC 17558 248.
## 10 Yes First Bazz~ Fema~ 32 0 0 11813 76.3
## # ... with 436 more rows, and 3 more variables: Cabin <chr>, Port <chr>,
## # LifeBoat <chr>
# Keep only the passengers who survived and are female; both conditions
# must hold for a row to be retained.
train %>%
  filter(Survived == "Yes", Sex == "Female")
## # A tibble: 325 x 12
## Survived Pclass Name Sex Age SibSp ParentChild TicketNumber Fare
## <chr> <chr> <chr> <chr> <int> <int> <int> <chr> <dbl>
## 1 Yes First Alle~ Fema~ 29 0 0 24160 211.
## 2 Yes First Andr~ Fema~ 63 1 0 13502 78.0
## 3 Yes First Appl~ Fema~ 53 2 0 11769 51.5
## 4 Yes First Asto~ Fema~ 18 1 0 PC 17757 228.
## 5 Yes First Auba~ Fema~ 24 0 0 PC 17477 69.3
## 6 Yes First Barb~ Fema~ 26 0 0 19877 78.8
## 7 Yes First Baxt~ Fema~ 50 0 1 PC 17558 248.
## 8 Yes First Bazz~ Fema~ 32 0 0 11813 76.3
## 9 Yes First Beck~ Fema~ 47 1 1 11751 52.6
## 10 Yes First Bido~ Fema~ 42 0 0 PC 17757 228.
## # ... with 315 more rows, and 3 more variables: Cabin <chr>, Port <chr>,
## # LifeBoat <chr>
# Min-max normalization: rescale `v` linearly so that its smallest value
# maps to 0 and its largest value maps to `range`.
#
# v:     numeric values to rescale; NA entries are ignored when finding the
#        minimum and maximum and stay NA in the result.
# range: upper bound of the target interval [0, range] (default 1).
#
# Returns an object of the same shape as `v` holding the rescaled values.
# When all non-missing values are equal, the result is 0 for each of them
# instead of the NaN that a 0 / 0 division would produce.
normalize <- function(v, range = 1) {
  # base:: is spelled out because the `range` argument shadows the function.
  bounds <- base::range(v, na.rm = TRUE)
  lo <- bounds[1]
  span <- bounds[2] - lo
  if (!is.na(span) && span == 0) {
    # Constant input: every value sits at the minimum, so map it to 0.
    return((v - lo) * range)
  }
  (v - lo) / span * range
}
## Min-max normalization: (v_i - v_min) / (v_max - v_min) * (1 - 0), which rescales the values to the range [0, 1]
##
v <- c(-1, 3, 6, -2, 0)
v.norm <- normalize(v)
v.norm
## [1] 0.125 0.625 1.000 0.000 0.250
v <- c(-1, 3, 6, -2, 0)
w <- c(2, 3, 0, 56, 1)
dataset <- tibble(v, w) #creates dataset
dataset %>%
map(normalize) %>%
as_tibble
## # A tibble: 5 x 2
## v w
## <dbl> <dbl>
## 1 0.125 0.0357
## 2 0.625 0.0536
## 3 1 0
## 4 0 1
## 5 0.25 0.0179
sapply(dataset, normalize)
## v w
## [1,] 0.125 0.03571429
## [2,] 0.625 0.05357143
## [3,] 1.000 0.00000000
## [4,] 0.000 1.00000000
## [5,] 0.250 0.01785714
weather <- read_csv("Info- I 421/weather.csv")
## Parsed with column specification:
## cols(
## outlook = col_character(),
## temperature = col_integer(),
## humidity = col_integer(),
## windy = col_logical(),
## play = col_character()
## )
weather %>%
summarise_all(is.numeric)
## # A tibble: 1 x 5
## outlook temperature humidity windy play
## <lgl> <lgl> <lgl> <lgl> <lgl>
## 1 FALSE TRUE TRUE FALSE FALSE
weather.norm <- weather %>%
mutate_if(is.numeric, normalize)
# Normalize every numeric column of `weather` by hand, leaving the
# non-numeric columns untouched.
weather.norm <- weather
# vapply() guarantees one logical flag per column.
types <- vapply(weather, is.numeric, logical(1))
# seq_along() yields an empty sequence when there are no columns, whereas
# 1:length(types) would iterate over c(1, 0).
for (i in seq_along(types)) {
  if (types[[i]]) {
    # [[ ]] extracts the column as a plain vector for normalize().
    weather.norm[[i]] <- normalize(weather[[i]])
  }
}
weather.norm
## # A tibble: 14 x 5
## outlook temperature humidity windy play
## <chr> <dbl> <dbl> <lgl> <chr>
## 1 sunny 1 0.645 FALSE no
## 2 sunny 0.762 0.806 TRUE no
## 3 overcast 0.905 0.677 FALSE yes
## 4 rainy 0.286 1 FALSE yes
## 5 rainy 0.190 0.484 FALSE yes
## 6 rainy 0.0476 0.161 TRUE no
## 7 overcast 0 0 TRUE yes
## 8 sunny 0.381 0.968 FALSE no
## 9 sunny 0.238 0.161 FALSE yes
## 10 rainy 0.524 0.484 FALSE yes
## 11 sunny 0.524 0.161 TRUE yes
## 12 overcast 0.381 0.806 TRUE yes
## 13 overcast 0.810 0.323 FALSE yes
## 14 rainy 0.333 0.839 TRUE no
# Apply normalize() to every numeric column of a data frame or tibble.
#
# data: a data frame / tibble; non-numeric columns are copied unchanged.
#
# Returns a copy of `data` in which each numeric column has been replaced
# by its normalize()d values. A data frame without columns (or without
# numeric columns) is returned as is.
normalizeDataset <- function(data) {
  data.norm <- data
  # Positions of the numeric columns; vapply() always yields a logical
  # vector, so this is empty (and the loop is skipped) when nothing matches.
  numeric_cols <- which(vapply(data, is.numeric, logical(1)))
  for (i in numeric_cols) {
    # [[ ]] extracts the column as a plain vector for normalize().
    data.norm[[i]] <- normalize(data[[i]])
  }
  data.norm
}
weather.norm <- normalizeDataset(weather)
train.norm <- normalizeDataset(train)
train.norm
## # A tibble: 1,244 x 12
## Survived Pclass Name Sex Age SibSp ParentChild TicketNumber
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <chr>
## 1 Yes First Alle~ Fema~ 0.354 0 0 24160
## 2 Yes First Alli~ Male NA 0.125 0.222 113781
## 3 No First Alli~ Fema~ 0.0127 0.125 0.222 113781
## 4 No First Alli~ Male 0.367 0.125 0.222 113781
## 5 No First Alli~ Fema~ 0.304 0.125 0.222 113781
## 6 Yes First Ande~ Male 0.595 0 0 19952
## 7 Yes First Andr~ Fema~ 0.785 0.125 0 13502
## 8 No First Andr~ Male 0.481 0 0 112050
## 9 Yes First Appl~ Fema~ 0.658 0.25 0 11769
## 10 No First Arta~ Male 0.886 0 0 PC 17609
## # ... with 1,234 more rows, and 4 more variables: Fare <dbl>, Cabin <chr>,
## # Port <chr>, LifeBoat <chr>
test.norm <- normalizeDataset(test)
test.norm
## # A tibble: 65 x 12
## Survived Pclass Name Sex Age SibSp ParentChild TicketNumber Fare
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 No First Beat~ Male 0.567 0 0 13050 0.286
## 2 Yes First Cham~ Male 0.417 0.125 0 113806 0.202
## 3 Yes First Chau~ Fema~ 0.567 0 0 PC 17608 0.998
## 4 Yes First Flem~ Fema~ NA 0 0 17421 0.422
## 5 Yes First Fort~ Fema~ 0.367 0.375 0.667 19950 1
## 6 Yes First Mayn~ Fema~ 0.367 0 0 PC 17482 0.188
## 7 No First Ring~ Male 0.333 0 0 PC 17760 0.516
## 8 Yes First Roth~ Fema~ 0.867 0.125 0 PC 17603 0.226
## 9 Yes First Snyd~ Fema~ 0.35 0.125 0 21228 0.313
## 10 Yes First Ston~ Fema~ 1 0 0 113572 0.304
## # ... with 55 more rows, and 3 more variables: Cabin <chr>, Port <chr>,
## # LifeBoat <chr>