library(readr)
std <- read_delim("stud.txt", delim="\t",skip=1, col_names=c("StudentName","Math","English"))
Rows: 3 Columns: 3
── Column specification ──────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): StudentName
dbl (2): Math, English

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
std
# A tibble: 3 × 3
  StudentName  Math English
  <chr>       <dbl>   <dbl>
1 Anna           86      90
2 John           43      75
3 Catherine      80      82
library(tidyr)
stdL <- gather(std, Subject, Grade, Math:English)
stdL
# A tibble: 6 × 3
  StudentName Subject Grade
  <chr>       <chr>   <dbl>
1 Anna        Math       86
2 John        Math       43
3 Catherine   Math       80
4 Anna        English    90
5 John        English    75
6 Catherine   English    82
spread(stdL, Subject, Grade)
# A tibble: 3 × 3
  StudentName English  Math
  <chr>         <dbl> <dbl>
1 Anna             90    86
2 Catherine        82    80
3 John             75    43
std2 <- read_delim("stud2.txt", delim="\t",skip=1, col_names=c("StudentName","Math","English","Degree_Year"))
Rows: 2 Columns: 4
── Column specification ──────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): StudentName, Degree_Year
dbl (2): Math, English

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
std2
# A tibble: 2 × 4
  StudentName  Math English Degree_Year
  <chr>       <dbl>   <dbl> <chr>      
1 John           43      75 Math_2013  
2 Catherine      80      82 Bio_2012   
std2L <- gather(std2, Subject, Grade, Math:English)
std2L <- separate(std2L, Degree_Year, c("Degree","Year"))
std2L 
# A tibble: 4 × 5
  StudentName Degree Year  Subject Grade
  <chr>       <chr>  <chr> <chr>   <dbl>
1 John        Math   2013  Math       43
2 Catherine   Bio    2012  Math       80
3 John        Math   2013  English    75
4 Catherine   Bio    2012  English    82
library(lubridate)

Attaching package: ‘lubridate’

The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union
ymd("20151021")
[1] "2015-10-21"
ymd("2015/11/30")
[1] "2015-11-30"
myd("11.2012.3")
[1] "2012-11-03"
dmy_hms("2/12/2013 14:05:01")
[1] "2013-12-02 14:05:01 UTC"
mdy("120186")
[1] "1986-12-01"
dates <- c(20120521, "2010-12-12", "2007/01/5", "2015-2-04","Measured on 2014-12-6", "2013-7-25")
dates <- ymd(dates)
dates
[1] "2012-05-21" "2010-12-12" "2007-01-05" "2015-02-04" "2014-12-06" "2013-07-25"
data.frame(Dates=dates,WeekDay=wday(dates),nWeekDay=wday(dates,label=TRUE),Year=year(dates),Month=month(dates,label=TRUE))
       Dates WeekDay nWeekDay Year Month
1 2012-05-21       2      Mon 2012   May
2 2010-12-12       1      Sun 2010   Dec
3 2007-01-05       6      Fri 2007   Jan
4 2015-02-04       4      Wed 2015   Feb
5 2014-12-06       7      Sat 2014   Dec
6 2013-07-25       5      Thu 2013   Jul
date <- ymd_hms("20150823 18:00:05",tz="Asia/Tehran")
date
[1] "2015-08-23 18:00:05 +0430"

library(dplyr)

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
library(stringr)
library(readr)
uci.repo <- "https://archive.ics.uci.edu/ml/machine-learning-databases/"
dataset <- "audiology/audiology.standardized"
dataF <- str_c(uci.repo,dataset,".data")
namesF <- str_c(uci.repo,dataset,".names")
## Reading the data file
data <- read_csv(url(dataF), col_names=FALSE, na="?")
Rows: 200 Columns: 71
── Column specification ──────────────────────────────────────────────────────────────────
Delimiter: ","
chr (11): X2, X4, X5, X6, X8, X59, X60, X64, X66, X70, X71
lgl (60): X1, X3, X7, X9, X10, X11, X12, X13, X14, X15, X16, X17, X18, X19, X20, X21, ...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data
# A tibble: 200 × 71
   X1    X2       X3    X4     X5    X6    X7    X8    X9    X10   X11   X12   X13   X14  
   <lgl> <chr>    <lgl> <chr>  <chr> <chr> <lgl> <chr> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
 1 FALSE mild     FALSE normal norm… NA    TRUE  NA    FALSE FALSE FALSE FALSE FALSE FALSE
 2 FALSE moderate FALSE normal norm… NA    TRUE  NA    FALSE FALSE FALSE FALSE FALSE FALSE
 3 TRUE  mild     TRUE  NA     abse… mild  TRUE  NA    FALSE FALSE FALSE FALSE FALSE FALSE
 4 TRUE  mild     TRUE  NA     abse… mild  FALSE NA    FALSE FALSE FALSE FALSE FALSE FALSE
 5 TRUE  mild     FALSE normal norm… mild  TRUE  NA    FALSE FALSE FALSE FALSE FALSE FALSE
 6 TRUE  mild     FALSE normal norm… mild  TRUE  NA    FALSE FALSE FALSE FALSE FALSE FALSE
 7 FALSE mild     FALSE normal norm… mild  TRUE  NA    FALSE FALSE FALSE FALSE FALSE FALSE
 8 FALSE mild     FALSE normal norm… mild  TRUE  NA    FALSE FALSE FALSE FALSE FALSE FALSE
 9 FALSE severe   FALSE NA     NA    NA    TRUE  NA    FALSE FALSE FALSE FALSE FALSE FALSE
10 TRUE  mild     FALSE eleva… abse… mild  TRUE  NA    FALSE FALSE FALSE FALSE FALSE FALSE
# ℹ 190 more rows
# ℹ 57 more variables: X15 <lgl>, X16 <lgl>, X17 <lgl>, X18 <lgl>, X19 <lgl>, X20 <lgl>,
#   X21 <lgl>, X22 <lgl>, X23 <lgl>, X24 <lgl>, X25 <lgl>, X26 <lgl>, X27 <lgl>,
#   X28 <lgl>, X29 <lgl>, X30 <lgl>, X31 <lgl>, X32 <lgl>, X33 <lgl>, X34 <lgl>,
#   X35 <lgl>, X36 <lgl>, X37 <lgl>, X38 <lgl>, X39 <lgl>, X40 <lgl>, X41 <lgl>,
#   X42 <lgl>, X43 <lgl>, X44 <lgl>, X45 <lgl>, X46 <lgl>, X47 <lgl>, X48 <lgl>,
#   X49 <lgl>, X50 <lgl>, X51 <lgl>, X52 <lgl>, X53 <lgl>, X54 <lgl>, X55 <lgl>, …
# ℹ Use `print(n = ...)` to see more rows
dim(data)
[1] 200  71
text <- read_lines(url(namesF))
text[1:3]
[1] "WARNING: This database should be credited to the original owner whenever"
[2] "         used for any publication whatsoever."                           
[3] ""                                                                        
length(text)
[1] 178
text[67:70]
[1] "   age_gt_60:\t\t     f, t."                            
[2] "   air():\t\t     mild,moderate,severe,normal,profound."
[3] "   airBoneGap:\t\t     f, t."                           
[4] "   ar_c():\t\t     normal,elevated,absent."             
nms <- str_split_fixed(text[67:135],":",n=2)[,1] # get the names
nms[1:3]
[1] "   age_gt_60"  "   air()"      "   airBoneGap"
nms <- str_trim(nms) # trim white space
nms[1:3]
[1] "age_gt_60"  "air()"      "airBoneGap"
nms <- str_replace_all(nms,"\\(|\\)","") # delete invalid chars.
nms[1:3]
[1] "age_gt_60"  "air"        "airBoneGap"
colnames(data)[1:69] <- nms
data[1:3,1:10]
# A tibble: 3 × 10
  age_gt_60 air      airBoneGap ar_c   ar_u   bone  boneAbnormal bser  history_buzzing
  <lgl>     <chr>    <lgl>      <chr>  <chr>  <chr> <lgl>        <chr> <lgl>          
1 FALSE     mild     FALSE      normal normal NA    TRUE         NA    FALSE          
2 FALSE     moderate FALSE      normal normal NA    TRUE         NA    FALSE          
3 TRUE      mild     TRUE       NA     absent mild  TRUE         NA    FALSE          
# ℹ 1 more variable: history_dizziness <lgl>
library(readr)
dat <- read_delim("dat.txt",delim = "\t",col_names=c("X","Y"))
Rows: 4 Columns: 2
── Column specification ──────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): X, Y

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dat
# A tibble: 4 × 2
  X     Y    
  <chr> <chr>
1 green 56   
2 blue  ?    
3 green 100  
4 red   -10  
class(dat$Y)
[1] "character"
library(readr)
dat$Y <- parse_integer(dat$Y, na="?")
dat
# A tibble: 4 × 2
  X         Y
  <chr> <int>
1 green    56
2 blue     NA
3 green   100
4 red     -10
class(dat$Y)
[1] "integer"
library(dplyr)
data(iris)
iris.stand <- cbind(scale(select(iris,-Species)),select(iris,Species))
summary(iris.stand)
  Sepal.Length       Sepal.Width       Petal.Length      Petal.Width            Species  
 Min.   :-1.86378   Min.   :-2.4258   Min.   :-1.5623   Min.   :-1.4422   setosa    :50  
 1st Qu.:-0.89767   1st Qu.:-0.5904   1st Qu.:-1.2225   1st Qu.:-1.1799   versicolor:50  
 Median :-0.05233   Median :-0.1315   Median : 0.3354   Median : 0.1321   virginica :50  
 Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000                  
 3rd Qu.: 0.67225   3rd Qu.: 0.5567   3rd Qu.: 0.7602   3rd Qu.: 0.7880                  
 Max.   : 2.48370   Max.   : 3.0805   Max.   : 1.7799   Max.   : 1.7064                  
mxs <- apply(select(iris,-Species), 2, max, na.rm=TRUE)
mns <- apply(select(iris,-Species), 2, min, na.rm=TRUE)
iris.norm <- cbind(scale(select(iris,-Species), center=mns, scale=mxs-mns),select(iris,Species))
summary(iris.norm)
  Sepal.Length     Sepal.Width      Petal.Length     Petal.Width            Species  
 Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   setosa    :50  
 1st Qu.:0.2222   1st Qu.:0.3333   1st Qu.:0.1017   1st Qu.:0.08333   versicolor:50  
 Median :0.4167   Median :0.4167   Median :0.5678   Median :0.50000   virginica :50  
 Mean   :0.4287   Mean   :0.4406   Mean   :0.4675   Mean   :0.45806                  
 3rd Qu.:0.5833   3rd Qu.:0.5417   3rd Qu.:0.6949   3rd Qu.:0.70833                  
 Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000                  
library(Hmisc) # for cut2()
Registered S3 method overwritten by 'htmlwidgets':
  method           from         
  print.htmlwidget tools:rstudio

Attaching package: ‘Hmisc’

The following objects are masked from ‘package:dplyr’:

    src, summarize

The following objects are masked from ‘package:base’:

    format.pval, units
data(Boston, package="MASS") # loading the data
summary(Boston$age) # the numeric variable we are going to discretize
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   2.90   45.02   77.50   68.57   94.08  100.00 
Boston$newAge <- cut(Boston$age,5)
table(Boston$newAge)

 (2.8,22.3] (22.3,41.7] (41.7,61.2] (61.2,80.6]  (80.6,100] 
         45          71          70          81         239 
Boston$newAge <- cut(Boston$age,5,labels=c("verynew","new","normal","old","veryold")) # alternative using our own labels for the bins
table(Boston$newAge)

verynew     new  normal     old veryold 
     45      71      70      81     239 
Boston$newAge <- cut2(Boston$age, g=5)
table(Boston$newAge)

[ 2.9, 38.1) [38.1, 66.1) [66.1, 86.1) [86.1, 95.7) [95.7,100.0] 
         102          101          101          101          101 
Boston$newAge <- factor(cut2(Boston$age,g=5),labels=c("verynew","new","normal","old","veryold"))
table(Boston$newAge)

verynew     new  normal     old veryold 
    102     101     101     101     101 
library(lubridate)
library(xts)
Loading required package: zoo

Attaching package: ‘zoo’

The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric


######################### Warning from 'xts' package ##########################
#                                                                             #
# The dplyr lag() function breaks how base R's lag() function is supposed to  #
# work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
# source() into this session won't work correctly.                            #
#                                                                             #
# Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
# conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
# dplyr from breaking base R's lag() function.                                #
#                                                                             #
# Code in packages is not affected. It's protected by R's namespace mechanism #
# Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
#                                                                             #
###############################################################################

Attaching package: ‘xts’

The following objects are masked from ‘package:dplyr’:

    first, last
sp500 <- xts(c(1102.94,1104.49,1115.71,1118.31),ymd(c("2010-02-25","2010-02-26","2010-03-01","2010-03-02")),tz=Sys.getenv("TZ"))
sp500
              [,1]
2010-02-25 1102.94
2010-02-26 1104.49
2010-03-01 1115.71
2010-03-02 1118.31
sp500["2010-03-02"]
              [,1]
2010-03-02 1118.31
sp500["2010-03"]
              [,1]
2010-03-01 1115.71
2010-03-02 1118.31
sp500["2010-03-01/"]
              [,1]
2010-03-01 1115.71
2010-03-02 1118.31
sp500["2010-02-26/2010-03-01"]
              [,1]
2010-02-26 1104.49
2010-03-01 1115.71
library(xts)
data(AirPassengers)
ap <- as.xts(AirPassengers);ap
         m.c.seq.row..seq.n...seq.col..drop...FALSE.
Jan 1949                                         112
Feb 1949                                         118
Mar 1949                                         132
Apr 1949                                         129
May 1949                                         121
Jun 1949                                         135
Jul 1949                                         148
Aug 1949                                         148
Sep 1949                                         136
Oct 1949                                         119
     ...                                            
Mar 1960                                         419
Apr 1960                                         461
May 1960                                         472
Jun 1960                                         535
Jul 1960                                         622
Aug 1960                                         606
Sep 1960                                         508
Oct 1960                                         461
Nov 1960                                         390
Dec 1960                                         432
apRel <- diff(ap)/ap[-length(ap)];apRel
                  e1
Jan 1949          NA
Feb 1949  0.05084746
Mar 1949  0.10606061
Apr 1949 -0.02325581
May 1949 -0.06611570
Jun 1949  0.10370370
Jul 1949  0.08783784
Aug 1949  0.00000000
Sep 1949 -0.08823529
Oct 1949 -0.14285714
     ...            
Feb 1960 -0.06649616
Mar 1960  0.06682578
Apr 1960  0.09110629
May 1960  0.02330508
Jun 1960  0.11775701
Jul 1960  0.13987138
Aug 1960 -0.02640264
Sep 1960 -0.19291339
Oct 1960 -0.10195228
Nov 1960 -0.18205128
head(ap)
         [,1]
Jan 1949  112
Feb 1949  118
Mar 1949  132
Apr 1949  129
May 1949  121
Jun 1949  135
head(embed(ap,4))
     [,1] [,2] [,3] [,4]
[1,]  129  132  118  112
[2,]  121  129  132  118
[3,]  135  121  129  132
[4,]  148  135  121  129
[5,]  148  148  135  121
[6,]  136  148  148  135
# Build a lagged ("embedded") data set from a uni-variate series.
#
# s:   a vector, single-column matrix or xts object holding the series
# emb: embedding dimension, i.e. the number of columns of the result
#
# Returns the matrix produced by embed() with its columns named
# T, T_1, ..., T_(emb-1). When `s` is an xts object the matrix is wrapped in
# an xts object indexed by the time tags of `s` from position `emb` onwards.
# Stops when `s` has more than one column, or when `emb` is smaller than 2
# or larger than the series.
createEmbedDS <- function(s, emb = 4) {
  shape <- dim(s)
  if (!is.null(shape) && shape[2] > 1) {
    stop("Only applicable to uni-variate time series")
  }
  nObs <- length(s)
  if (emb < 2 || emb > nObs) {
    stop("Invalid embed size")
  }

  lagged <- embed(s, emb)
  colnames(lagged) <- c("T", paste0("T_", seq_len(emb - 1)))

  if (!is.xts(s)) {
    return(lagged)
  }
  # embed() yields one row per position emb..nObs, so those are the time tags
  xts(lagged, index(s)[emb:nObs])
}
dataSet <- createEmbedDS(ap,emb=5)
head(dataSet)
           T T_1 T_2 T_3 T_4
May 1949 121 129 132 118 112
Jun 1949 135 121 129 132 118
Jul 1949 148 135 121 129 132
Aug 1949 148 148 135 121 129
Sep 1949 136 148 148 135 121
Oct 1949 119 136 148 148 135
library(readr)
ff <- read_csv("forestFires.csv")
Rows: 25000 Columns: 13
── Column specification ──────────────────────────────────────────────────────────────────
Delimiter: ","
dbl (13): FID_CID, ano1991, ano1992, ano1993, ano1994, ano1995, ano1996, ano1997, ano1...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ff
# A tibble: 25,000 × 13
   FID_CID ano1991 ano1992 ano1993 ano1994 ano1995 ano1996 ano1997 ano1998 ano1999 ano2000
     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
 1       1       0       0       0       0       0       0       0       0       0       0
 2       2       0       0       0       0       0       0       0       0       0       0
 3       3       0       0       0       0       0       0       0       0       0       0
 4       4       0       0       0       0       0       0       0       0       0       0
 5       5       0       0       0       0       0       0       0       0       0       0
 6       6       0       0       0       0       0       0       0       0       1       0
 7       7       0       0       0       0       0       0       0       0       0       0
 8       8       0       0       0       0       0       0       0       0       0       0
 9       9       0       0       0       0       0       0       0       0       0       0
10      10       0       0       0       0       0       0       0       0       0       0
# ℹ 24,990 more rows
# ℹ 2 more variables: x <dbl>, y <dbl>
# ℹ Use `print(n = ...)` to see more rows
library(sp)
library(dplyr)
spatialCoords <- select(ff,long=x,lat=y) # the contextual data
firesData <- select(ff,ano2000) # the behavioral data
coordRefSys <- CRS("+proj=longlat +ellps=WGS84")
fires2000 <- SpatialPointsDataFrame(spatialCoords,
firesData,
proj4string=coordRefSys)
fires2000[1:3,]
          coordinates ano2000
1 (-7.31924, 38.5406)       0
2 (-7.63557, 40.5022)       0
3 (-7.90273, 40.3418)       0
bbox(fires2000)
          min      max
long -9.49174 -6.20743
lat  36.98050 42.14360
coordinates(fires2000)[1:3,]
         long     lat
[1,] -7.31924 38.5406
[2,] -7.63557 40.5022
[3,] -7.90273 40.3418
summary(fires2000)
Object of class SpatialPointsDataFrame
Coordinates:
          min      max
long -9.49174 -6.20743
lat  36.98050 42.14360
Is projected: FALSE 
proj4string : [+proj=longlat +ellps=WGS84]
Number of points: 25000
Data attributes:
    ano2000       
 Min.   :0.00000  
 1st Qu.:0.00000  
 Median :0.00000  
 Mean   :0.01612  
 3rd Qu.:0.00000  
 Max.   :1.00000  
library(ggplot2)
library(ggmap)
ℹ Google's Terms of Service: <https://mapsplatform.google.com>
  Stadia Maps' Terms of Service: <https://stadiamaps.com/terms-of-service/>
  OpenStreetMap's Tile Usage Policy: <https://operations.osmfoundation.org/policies/tiles/>
ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
library(tibble)
mapPT <- get_map(center="Portugal",zoom=3)
Error in `get_googlemap()`:
! Google now requires an API key; see `ggmap::register_google()`.
Backtrace:
 1. ggmap::get_map(center = "Portugal", zoom = 3)
 2. ggmap::get_googlemap(...)
---
title: "Data Mining Sec.3.5"
output:
  html_notebook: default
  chunk_output_type: console
  pdf_document: default
  html_document:
    df_print: paged
  word_document: default
editor_options:
  chunk_output_type: inline
---

```{r Tidy Data, echo=TRUE, message=FALSE, warning=FALSE, paged.print=FALSE}
# ---- wide table: one row per student, one column per subject ----
library(readr)
# the first line of the file is skipped (presumably its own header) and the
# column names are supplied explicitly
std <- read_delim(
  "stud.txt",
  delim = "\t",
  skip = 1,
  col_names = c("StudentName", "Math", "English")
)
std

# ---- wide -> long: one row per (student, subject) pair ----
library(tidyr)
stdL <- gather(std, key = "Subject", value = "Grade", Math:English)
stdL

# ---- long -> wide again ----
spread(stdL, key = Subject, value = Grade)

# ---- a table that also packs two values into a single column ----
std2 <- read_delim(
  "stud2.txt",
  delim = "\t",
  skip = 1,
  col_names = c("StudentName", "Math", "English", "Degree_Year")
)
std2

# reshape to long format, then split Degree_Year into its two parts
std2L <- separate(
  gather(std2, key = "Subject", value = "Grade", Math:English),
  col = Degree_Year,
  into = c("Degree", "Year")
)
std2L


```

```{r LubriDate, echo=TRUE, message=FALSE, warning=FALSE, paged.print=FALSE}
library(lubridate)

# Each parser is named after the order in which the date components appear
# in its input; separators are detected automatically.
ymd("20151021")                 # year, month, day
ymd("2015/11/30")
myd("11.2012.3")                # month, year, day
dmy_hms("2/12/2013 14:05:01")   # day, month, year + hours, minutes, seconds
mdy("120186")                   # month, day, two-digit year

```

```{r echo=TRUE, message=FALSE, warning=FALSE, paged.print=FALSE}
# Year-month-day values written in several different ways; c() coerces the
# leading number to a string because the other elements are strings.
dates <- ymd(c(
  20120521, "2010-12-12", "2007/01/5",
  "2015-2-04", "Measured on 2014-12-6", "2013-7-25"
))
dates

```

```{r echo=TRUE, message=FALSE, warning=FALSE, paged.print=FALSE}
# One row per parsed date with some of its components extracted
data.frame(
  Dates = dates,
  WeekDay = wday(dates),                 # day of the week as a number
  nWeekDay = wday(dates, label = TRUE),  # day of the week as a label
  Year = year(dates),
  Month = month(dates, label = TRUE)
)

# A date-time attached to an explicit time zone
date <- ymd_hms("20150823 18:00:05", tz = "Asia/Tehran")
date

```


```{r String Processing, echo=TRUE, message=FALSE, warning=FALSE, paged.print=FALSE}

library(dplyr)
library(stringr)
library(readr)

# Build the URLs of the data file and of the file describing its columns
uci.repo <- "https://archive.ics.uci.edu/ml/machine-learning-databases/"
dataset <- "audiology/audiology.standardized"
dataF <- str_c(uci.repo, dataset, ".data")
namesF <- str_c(uci.repo, dataset, ".names")

## Reading the data file
# the first line is data rather than column names; "?" is read as NA
data <- read_csv(url(dataF), col_names = FALSE, na = "?")
data

dim(data)

## Reading the description file, one string per line
text <- read_lines(url(namesF))
text[1:3]
length(text)
text[67:70]

## Extracting the column names from lines 67 to 135 of the description
# keep what comes before the first ":" on each line
nms <- str_split_fixed(text[67:135], ":", n = 2)[, 1] # get the names
nms[1:3]
nms <- str_trim(nms) # trim white space
nms[1:3]
nms <- str_replace_all(nms, "\\(|\\)", "") # delete invalid chars.
nms[1:3]

# only the first 69 columns receive a name; the remaining ones keep the
# default names assigned by read_csv()
colnames(data)[1:69] <- nms
data[1:3, 1:10]
```


```{r Dealing with Unknown Values, echo=TRUE, warning=FALSE, paged.print=FALSE}
library(readr)

# Y is read as text because the file marks unknown values with "?"
dat <- read_delim("dat.txt", delim = "\t", col_names = c("X", "Y"))
dat
class(dat$Y)

# Re-parse the column as integer, telling the parser that "?" means NA
dat$Y <- parse_integer(dat$Y, na = "?")
dat
class(dat$Y)


```


```{r Handling Different Scales of Variables, echo=TRUE, message=FALSE, warning=FALSE, paged.print=FALSE}
library(dplyr)
data(iris)

# Standardisation: scale() with its defaults centres each numeric column
# and divides it by its standard deviation; Species is appended unchanged.
iris.stand <- cbind(
  scale(select(iris, -Species)),
  select(iris, Species)
)
summary(iris.stand)

# Min-max normalisation: (x - min) / (max - min) for each numeric column
mxs <- apply(select(iris, -Species), MARGIN = 2, FUN = max, na.rm = TRUE)
mns <- apply(select(iris, -Species), MARGIN = 2, FUN = min, na.rm = TRUE)
iris.norm <- cbind(
  scale(select(iris, -Species), center = mns, scale = mxs - mns),
  select(iris, Species)
)
summary(iris.norm)
```


```{r Discretizing Variables, echo=TRUE, message=FALSE, warning=FALSE, paged.print=FALSE}
library(Hmisc) # for cut2()
data(Boston, package = "MASS") # loading the data
summary(Boston$age) # the numeric variable we are going to discretize

# cut() with a bin count: five intervals of equal width
Boston$newAge <- cut(Boston$age, breaks = 5)
table(Boston$newAge)

# alternative using our own labels for the bins
Boston$newAge <- cut(
  Boston$age,
  breaks = 5,
  labels = c("verynew", "new", "normal", "old", "veryold")
)
table(Boston$newAge)

# cut2() with g = 5: five quantile groups of (nearly) equal size
Boston$newAge <- cut2(Boston$age, g = 5)
table(Boston$newAge)

# the same quantile groups, relabelled
Boston$newAge <- factor(
  cut2(Boston$age, g = 5),
  labels = c("verynew", "new", "normal", "old", "veryold")
)
table(Boston$newAge)

```


```{r Creating Variables, echo=TRUE, message=FALSE, warning=FALSE, paged.print=FALSE}
library(lubridate)
library(xts)

# Four values, each paired with the date it refers to
sp500 <- xts(
  x = c(1102.94, 1104.49, 1115.71, 1118.31),
  order.by = ymd(c("2010-02-25", "2010-02-26", "2010-03-01", "2010-03-02")),
  tz = Sys.getenv("TZ")
)
sp500

# xts objects can be subset with date strings
sp500["2010-03-02"]             # a single day
sp500["2010-03"]                # a whole month
sp500["2010-03-01/"]            # from a date onwards
sp500["2010-02-26/2010-03-01"]  # a closed interval



```

```{r Time dependencies, echo=TRUE, message=FALSE, warning=FALSE, paged.print=FALSE}
library(xts)
data(AirPassengers)

# the monthly AirPassengers series as an xts object
ap <- as.xts(AirPassengers)
ap

# Relative variation between consecutive months: (y_t - y_{t-1}) / y_{t-1}.
# Arithmetic between xts objects pairs observations by time tag, not by
# position, so dividing by ap[-length(ap)] would divide each difference by
# the value of the *same* month and silently drop the last month. The
# previous value is therefore moved onto the current time tag with a lag.
# stats::lag() is spelled out because dplyr, attached earlier in this
# document, masks lag() with a version that does not work on xts objects.
apRel <- diff(ap) / stats::lag(ap, k = 1)
apRel

head(ap)
# embed() places the current value in column 1 and the previous ones after it
head(embed(ap, 4))

# Build a lagged ("embedded") data set from a uni-variate series.
#
# s:   a vector, single-column matrix or xts object holding the series
# emb: embedding dimension, i.e. the number of columns of the result
#
# Returns the matrix produced by embed() with its columns named
# T, T_1, ..., T_(emb-1). When `s` is an xts object the matrix is wrapped in
# an xts object indexed by the time tags of `s` from position `emb` onwards.
# Stops when `s` has more than one column, or when `emb` is smaller than 2
# or larger than the series.
createEmbedDS <- function(s, emb = 4) {
  shape <- dim(s)
  if (!is.null(shape) && shape[2] > 1) {
    stop("Only applicable to uni-variate time series")
  }
  nObs <- length(s)
  if (emb < 2 || emb > nObs) {
    stop("Invalid embed size")
  }

  lagged <- embed(s, emb)
  colnames(lagged) <- c("T", paste0("T_", seq_len(emb - 1)))

  if (!is.xts(s)) {
    return(lagged)
  }
  # embed() yields one row per position emb..nObs, so those are the time tags
  xts(lagged, index(s)[emb:nObs])
}
# five columns per row: the current value (T) and the four before it
dataSet <- createEmbedDS(ap, emb = 5)
head(dataSet)



```

```{r Spatial dependencies, echo=TRUE, message=FALSE, warning=FALSE, paged.print=FALSE}
library(readr)

# one row per location; the columns used below are ano2000, x and y
ff <- read_csv("forestFires.csv")
ff

library(sp)
library(dplyr)

# split the table into where (coordinates) and what (the 2000 indicator)
spatialCoords <- select(ff, long = x, lat = y) # the contextual data
firesData <- select(ff, ano2000) # the behavioral data

# longitude/latitude coordinates on the WGS84 ellipsoid
coordRefSys <- CRS("+proj=longlat +ellps=WGS84")

fires2000 <- SpatialPointsDataFrame(
  coords = spatialCoords,
  data = firesData,
  proj4string = coordRefSys
)
fires2000[1:3, ]

# inspecting the spatial object
bbox(fires2000)
coordinates(fires2000)[1:3, ]
summary(fires2000)

library(ggplot2)
library(ggmap)
library(tibble)

# coordinates of every location together with its indicator for 2000
d4plot <- as_tibble(cbind(coordinates(fires2000), burnt = fires2000$ano2000))
# dplyr:: is spelled out so the call cannot pick up stats::filter()
burntSites <- dplyr::filter(d4plot, burnt == 1)

# get_map() downloads its tiles from Google by default, which fails without
# a registered API key. Only request the background map when a key is
# available; otherwise draw the points on plain axes so the chunk still
# produces a figure instead of aborting.
if (has_google_key()) {
  # `location` is the declared argument of get_map(); `center` is not
  mapPT <- get_map(location = "Portugal", zoom = 3)
  ggmap(mapPT) +
    geom_point(data = burntSites, aes(x = long, y = lat), col = "orange")
} else {
  message(
    "No Google API key registered (see ggmap::register_google()); ",
    "plotting the locations without a background map."
  )
  ggplot(burntSites, aes(x = long, y = lat)) +
    geom_point(col = "orange") +
    coord_quickmap()
}

```

```{r Handling Text Datasets, echo=TRUE, message=FALSE, warning=FALSE, paged.print=FALSE}
library(tm)

# The folder is addressed by its full path instead of calling setwd():
# changing the working directory inside a chunk alters session state for
# everything that runs afterwards.
docs <- Corpus(DirSource("E:/Documents"))
docs

docs[[2]]
content(docs[[2]])[1:3]

# Clean-up pipeline applied to every document, in this order
# (%>% is available because dplyr is attached earlier in this document)
docs <- docs %>%
  tm_map(removePunctuation) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeNumbers) %>%
  tm_map(removeWords, stopwords("en")) %>%
  tm_map(stripWhitespace) %>%
  tm_map(stemDocument)
content(docs[[2]])[1:3]

# document-term matrix with TF-IDF weighting
data <- DocumentTermMatrix(docs, control = list(weighting = weightTfIdf))
data

inspect(data[1:2, 1:5])

# terms whose total weight is at least 0.9, and terms whose correlation
# with "race" is at least 0.5
findFreqTerms(data, 0.9)
findAssocs(data, "race", 0.5)

# drop the sparsest terms (sparsity threshold 0.7)
newData <- removeSparseTerms(data, 0.7)
newData

library(tibble)
as_tibble(as.matrix(newData))
```

```{r Dimensionality Reduction, echo=TRUE, message=FALSE, warning=FALSE, paged.print=FALSE}
data(iris)

# fraction of the rows to draw
prop2sample <- 0.5

# row numbers drawn with replacement, so a row may be picked more than once
rowIDs <- sample(
  seq_len(nrow(iris)),
  size = as.integer(prop2sample * nrow(iris)),
  replace = TRUE
)
iris.sample <- iris[rowIDs, ]

# Count the lines of a text file without loading it into memory.
#
# f: path of the file
#
# Returns an integer: the number of newline characters in the file, which is
# the figure `wc -l` reports. The file is read as raw bytes in fixed-size
# chunks, so the function needs no external program and behaves the same on
# every platform. The previous version shelled out to `wc`, but only when
# the platform was "windows", and stopped with "requires unix-based systems"
# everywhere else.
nrLinesFile <- function(f) {
  con <- file(f, open = "rb")
  on.exit(close(con), add = TRUE)

  newline <- as.raw(10L)
  chunkSize <- 1048576L # bytes read per iteration
  nLines <- 0L
  repeat {
    chunk <- readBin(con, what = "raw", n = chunkSize)
    if (length(chunk) == 0L) break # end of file
    nLines <- nLines + sum(chunk == newline)
  }
  nLines
}

# Draw a random sample of the rows of a large CSV file without reading the
# whole file into R. The sampling is delegated to a perl one-liner that
# writes the selected lines to "<file>.tmp.csv", which is then read with
# readr::read_csv() and deleted.
#
# file:    path of the CSV file
# percORn: below 1, the proportion of lines to sample; otherwise the number
#          of rows wanted
# nrLines: number of lines in the file; counted with nrLinesFile() if missing
# header:  passed to read_csv() as `col_names`
# mxPerc:  largest proportion this function accepts
#
# Returns the data frame produced by read_csv(), with at most `percORn`
# rows; a warning is raised when fewer rows than requested were obtained.
sampleCSV <- function(file, percORn, nrLines, header = TRUE, mxPerc = 0.5) {
  # The shell quoting and redirection used below need a unix shell. The
  # guard used to test for "windows", contradicting its own message.
  if (.Platform$OS.type != "unix") {
    stop("This function requires unix-based systems")
  }
  # fail loudly instead of require()'s silent FALSE
  if (!requireNamespace("readr", quietly = TRUE)) {
    stop("Package 'readr' is required by sampleCSV()")
  }

  if (missing(nrLines)) nrLines <- nrLinesFile(file)

  if (percORn < 1) {
    if (percORn > mxPerc) {
      stop("This function is not adequate for that big samples.")
    }
    percORn <- as.integer(percORn * nrLines)
  }
  # keep each line with twice the target probability (capped at mxPerc) and
  # let n_max trim the surplus when reading
  perc <- min(2 * percORn / nrLines, mxPerc)

  tmpFile <- paste0(file, ".tmp.csv")
  # remove the temporary file however the function exits
  on.exit(unlink(tmpFile), add = TRUE)

  # When the first line holds the column names it must always be kept,
  # otherwise a random data row would be read as the header.
  keepRule <- if (isTRUE(header)) "$. == 1 || rand() < " else "rand() < "
  # The ">" redirect was missing, so the temporary file was never written;
  # shQuote() protects paths containing spaces or shell metacharacters.
  status <- system(paste0(
    "perl -ne 'print if (", keepRule, perc, ")' ",
    shQuote(file), " > ", shQuote(tmpFile)
  ))
  if (status != 0) {
    stop("Sampling the file with perl failed (exit status ", status, ")")
  }

  dt <- readr::read_csv(tmpFile, col_names = header, n_max = percORn)
  if (nrow(dt) != percORn) {
    warning(paste("Expecting", percORn, "rows, but got", nrow(dt)))
  }
  dt
}

# Time the extraction of a 1% sample from the CSV file; the number of lines
# is supplied so that the file does not have to be counted first
t <- Sys.time()
library(readr)
library(DMwR2)
d <- sampleCSV(
  "Allsensors.csv",
  percORn = 0.01,
  nrLines = 376320,
  header = TRUE,
  mxPerc = 0.5
)

Sys.time() - t

nrow(d)

# Open the database connection used for the sampling that follows
library(DBI)
library(RMySQL)
drv <- dbDriver("MySQL") # Loading the MySQL driver
con <- dbConnect(
  drv,
  dbname = "transDB",
  username = "myuser",
  password = "mypassword",
  host = "localhost"
)


# Draw a random sample of the rows of a database table, letting the
# database do the sampling.
#
# dbConn:  an open DBI connection
# tbl:     name of the table; it is pasted into the SQL text as is, so it
#          must come from a trusted source
# percORn: below 1, the proportion of rows to sample; otherwise the number
#          of rows wanted
# mxPerc:  largest proportion this function accepts
#
# Returns the data frame produced by dbGetQuery(), with at most `percORn`
# rows; a warning is raised when the row count differs from the request.
sampleDBMS <- function(dbConn, tbl, percORn, mxPerc = 0.5) {
  nrRecords <- unlist(dbGetQuery(dbConn, paste("select count(*) from", tbl)))

  if (percORn < 1) {
    if (percORn > mxPerc) {
      stop("This function is not adequate for that big samples.")
    }
    percORn <- as.integer(percORn * nrRecords)
  }
  # keep each row with twice the target probability (capped at mxPerc) and
  # let the LIMIT clause trim the surplus
  perc <- min(2 * percORn / nrRecords, mxPerc)

  # paste() renders a number such as 1e5 as "1e+05", which is not a valid
  # LIMIT value, so the row count is formatted in fixed notation
  rowLimit <- format(percORn, scientific = FALSE, trim = TRUE)

  dt <- dbGetQuery(
    dbConn,
    paste(
      "select * from (select * from", tbl,
      "where rand() <= ", perc, ") as t limit ", rowLimit
    )
  )
  if (nrow(dt) != percORn) {
    warning(paste("Expecting", percORn, "rows, but got", nrow(dt)))
  }
  dt
}


# Time the extraction of 10000 rows from the sensor_values table
t1 <- Sys.time()
d <- sampleDBMS(con, "sensor_values", 10000)
Sys.time() - t1

nrow(d)

# Release the connection first, then the driver
dbDisconnect(con)

dbUnloadDriver(drv)

```

```{r Variable Selection, echo=TRUE, message=FALSE, warning=FALSE, paged.print=FALSE}
library(CORElearn)

# ---- classification target: score the iris predictors against Species ----
data(iris)
attrEval(Species ~ ., data = iris, estimator = "GainRatio")

attrEval(Species ~ ., data = iris, estimator = "InfGain")

attrEval(Species ~ ., data = iris, estimator = "Gini")

attrEval(Species ~ ., data = iris, estimator = "MDL")

# estimators available for this kind of problem
infoCore(what = "attrEval")

# ---- numeric target: a1 against the other first 12 columns of algae ----
data(algae, package = "DMwR2")
attrEval(a1 ~ ., data = algae[, 1:12], estimator = "MSEofMean")

attrEval(a1 ~ ., data = algae[, 1:12], estimator = "RReliefFexpRank")

# estimators available for regression problems
infoCore(what = "attrEvalReg")

data(iris)

# each case is described by the first 4 variables
pca.data <- iris[, -5]

# principal components of the four measurements
pca <- princomp(pca.data)
loadings(pca)

# coordinates of the first cases in the component space
pca$scores[1:5, ]

dim(iris)

# keep only the first two components, plus the class label
reduced.iris <- data.frame(
  pca$scores[, 1:2],
  Species = iris$Species
)
dim(reduced.iris)

head(reduced.iris)
```



