data <- read.csv('deltaDNA-Manipulation-with-R.csv')
library(RPostgreSQL)
## Loading required package: DBI
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data <- tbl_df(data)
data
## # A tibble: 15,781 × 6
## X user_id time gender age_group
## <int> <fctr> <fctr> <fctr> <fctr>
## 1 1 1000-0018-1034-FAKE 16/06/2015 12:44 MALE 30-34
## 2 2 1000-0004-1582-FAKE 08/06/2015 05:11 FEMALE 35-39
## 3 3 1000-0005-8033-FAKE 08/06/2015 03:32 FEMALE 40-49
## 4 4 1000-0008-6689-FAKE 08/06/2015 04:20 FEMALE 35-39
## 5 5 1000-0009-4274-FAKE 08/06/2015 17:49 FEMALE 35-39
## 6 6 1000-0009-7488-FAKE 08/06/2015 08:06 FEMALE 30-34
## 7 7 1000-0009-8265-FAKE 08/06/2015 08:25 FEMALE 40-49
## 8 8 1000-0010-0709-FAKE 08/06/2015 05:50 FEMALE 40-49
## 9 9 1000-0010-4359-FAKE 08/06/2015 05:54 FEMALE 25-29
## 10 10 1000-0010-7904-FAKE 08/06/2015 03:18 FEMALE 18-24
## # ... with 15,771 more rows, and 1 more variables: spend_amount <int>
data <- select(data,user_id:spend_amount)
data
## # A tibble: 15,781 × 5
## user_id time gender age_group spend_amount
## <fctr> <fctr> <fctr> <fctr> <int>
## 1 1000-0018-1034-FAKE 16/06/2015 12:44 MALE 30-34 399
## 2 1000-0004-1582-FAKE 08/06/2015 05:11 FEMALE 35-39 999
## 3 1000-0005-8033-FAKE 08/06/2015 03:32 FEMALE 40-49 299
## 4 1000-0008-6689-FAKE 08/06/2015 04:20 FEMALE 35-39 399
## 5 1000-0009-4274-FAKE 08/06/2015 17:49 FEMALE 35-39 99
## 6 1000-0009-7488-FAKE 08/06/2015 08:06 FEMALE 30-34 1499
## 7 1000-0009-8265-FAKE 08/06/2015 08:25 FEMALE 40-49 99
## 8 1000-0010-0709-FAKE 08/06/2015 05:50 FEMALE 40-49 399
## 9 1000-0010-4359-FAKE 08/06/2015 05:54 FEMALE 25-29 299
## 10 1000-0010-7904-FAKE 08/06/2015 03:18 FEMALE 18-24 549
## # ... with 15,771 more rows
table(data$age_group)
##
## 13-17 18-24 25-29 30-34 35-39 40-49 50-64 UNKNOWN
## 1110 2994 2298 1860 1817 2887 2532 283
select(data,time,gender)
## # A tibble: 15,781 × 2
## time gender
## <fctr> <fctr>
## 1 16/06/2015 12:44 MALE
## 2 08/06/2015 05:11 FEMALE
## 3 08/06/2015 03:32 FEMALE
## 4 08/06/2015 04:20 FEMALE
## 5 08/06/2015 17:49 FEMALE
## 6 08/06/2015 08:06 FEMALE
## 7 08/06/2015 08:25 FEMALE
## 8 08/06/2015 05:50 FEMALE
## 9 08/06/2015 05:54 FEMALE
## 10 08/06/2015 03:18 FEMALE
## # ... with 15,771 more rows
filter(data,spend_amount > 999)
## # A tibble: 1,559 × 5
## user_id time gender age_group spend_amount
## <fctr> <fctr> <fctr> <fctr> <int>
## 1 1000-0009-7488-FAKE 08/06/2015 08:06 FEMALE 30-34 1499
## 2 1000-0013-5999-FAKE 08/06/2015 16:43 FEMALE 50-64 2999
## 3 1000-0008-8492-FAKE 09/06/2015 04:17 FEMALE 35-39 1999
## 4 1000-0009-8265-FAKE 09/06/2015 07:39 FEMALE 40-49 1499
## 5 1000-0020-0396-FAKE 09/06/2015 09:14 MALE 18-24 1499
## 6 1000-0013-6070-FAKE 10/06/2015 02:14 FEMALE 40-49 1999
## 7 1000-0016-4715-FAKE 10/06/2015 07:45 FEMALE 18-24 1499
## 8 1000-0015-0402-FAKE 11/06/2015 13:19 FEMALE 18-24 1499
## 9 1000-0008-8492-FAKE 12/06/2015 01:01 FEMALE 35-39 1499
## 10 1000-0012-0794-FAKE 12/06/2015 02:37 FEMALE 35-39 2499
## # ... with 1,549 more rows
arrange(data,time)
## # A tibble: 15,781 × 5
## user_id time gender age_group spend_amount
## <fctr> <fctr> <fctr> <fctr> <int>
## 1 1000-0022-0401-FAKE 01/07/2015 00:06 MALE 30-34 299
## 2 1000-0018-4800-FAKE 01/07/2015 00:09 FEMALE 30-34 99
## 3 1000-0006-5869-FAKE 01/07/2015 00:10 FEMALE 35-39 99
## 4 1000-0018-4348-FAKE 01/07/2015 00:17 FEMALE 50-64 2499
## 5 1000-0010-8753-FAKE 01/07/2015 00:21 FEMALE 30-34 549
## 6 1000-0020-3104-FAKE 01/07/2015 00:36 FEMALE 13-17 549
## 7 1000-0010-7115-FAKE 01/07/2015 00:38 FEMALE 25-29 99
## 8 1000-0019-0655-FAKE 01/07/2015 00:39 FEMALE 13-17 99
## 9 1000-0010-5767-FAKE 01/07/2015 00:39 FEMALE 50-64 2499
## 10 1000-0008-5180-FAKE 01/07/2015 00:40 FEMALE 18-24 1999
## # ... with 15,771 more rows
data.a <- arrange(data,desc(time))
data.c <- filter(data.a,spend_amount<999)
summarise(data, mean_spend=mean(spend_amount),
median_spend = median(spend_amount))
## # A tibble: 1 × 2
## mean_spend median_spend
## <dbl> <int>
## 1 524.5782 299
glimpse(data)
## Observations: 15,781
## Variables: 5
## $ user_id <fctr> 1000-0018-1034-FAKE, 1000-0004-1582-FAKE, 1000-0...
## $ time <fctr> 16/06/2015 12:44, 08/06/2015 05:11, 08/06/2015 0...
## $ gender <fctr> MALE, FEMALE, FEMALE, FEMALE, FEMALE, FEMALE, FE...
## $ age_group <fctr> 30-34, 35-39, 40-49, 35-39, 35-39, 30-34, 40-49,...
## $ spend_amount <int> 399, 999, 299, 399, 99, 1499, 99, 399, 299, 549, ...
summary(data)
## user_id time gender
## 1000-0004-9525-FAKE: 15 24/06/2015 03:19: 5 FEMALE :13429
## 1000-0014-9019-FAKE: 14 01/07/2015 06:11: 4 MALE : 2069
## 1000-0015-2677-FAKE: 14 05/07/2015 10:48: 4 UNKNOWN: 283
## 1000-0017-7806-FAKE: 14 07/07/2015 05:03: 4
## 1000-0016-3159-FAKE: 13 07/07/2015 06:47: 4
## 1000-0016-7319-FAKE: 13 08/06/2015 01:32: 4
## (Other) :15698 (Other) :15756
## age_group spend_amount
## 18-24 :2994 Min. : 99.0
## 40-49 :2887 1st Qu.: 99.0
## 50-64 :2532 Median : 299.0
## 25-29 :2298 Mean : 524.6
## 30-34 :1860 3rd Qu.: 549.0
## 35-39 :1817 Max. :4999.0
## (Other):1393
data1 <- group_by(data, gender)
print(data1,n=20)
## Source: local data frame [15,781 x 5]
## Groups: gender [3]
##
## user_id time gender age_group spend_amount
## <fctr> <fctr> <fctr> <fctr> <int>
## 1 1000-0018-1034-FAKE 16/06/2015 12:44 MALE 30-34 399
## 2 1000-0004-1582-FAKE 08/06/2015 05:11 FEMALE 35-39 999
## 3 1000-0005-8033-FAKE 08/06/2015 03:32 FEMALE 40-49 299
## 4 1000-0008-6689-FAKE 08/06/2015 04:20 FEMALE 35-39 399
## 5 1000-0009-4274-FAKE 08/06/2015 17:49 FEMALE 35-39 99
## 6 1000-0009-7488-FAKE 08/06/2015 08:06 FEMALE 30-34 1499
## 7 1000-0009-8265-FAKE 08/06/2015 08:25 FEMALE 40-49 99
## 8 1000-0010-0709-FAKE 08/06/2015 05:50 FEMALE 40-49 399
## 9 1000-0010-4359-FAKE 08/06/2015 05:54 FEMALE 25-29 299
## 10 1000-0010-7904-FAKE 08/06/2015 03:18 FEMALE 18-24 549
## 11 1000-0011-7598-FAKE 08/06/2015 12:29 FEMALE 25-29 999
## 12 1000-0011-7937-FAKE 08/06/2015 00:46 FEMALE 50-64 99
## 13 1000-0011-8958-FAKE 08/06/2015 16:55 FEMALE 35-39 99
## 14 1000-0012-1131-FAKE 08/06/2015 12:08 FEMALE 30-34 299
## 15 1000-0013-5999-FAKE 08/06/2015 16:43 FEMALE 50-64 2999
## 16 1000-0013-9998-FAKE 08/06/2015 07:39 FEMALE 50-64 99
## 17 1000-0014-2544-FAKE 08/06/2015 15:52 MALE 25-29 99
## 18 1000-0015-7158-FAKE 08/06/2015 14:53 FEMALE 18-24 99
## 19 1000-0015-7982-FAKE 08/06/2015 03:13 MALE 40-49 99
## 20 1000-0016-3869-FAKE 08/06/2015 19:15 FEMALE 25-29 549
## # ... with 1.576e+04 more rows
data2 <- summarise(data1,mean=mean(spend_amount))
data2
## # A tibble: 3 × 2
## gender mean
## <fctr> <dbl>
## 1 FEMALE 524.2625
## 2 MALE 525.8004
## 3 UNKNOWN 530.6254
data3 <- summarise(data1,media=median(spend_amount))
data3
## # A tibble: 3 × 2
## gender media
## <fctr> <int>
## 1 FEMALE 299
## 2 MALE 299
## 3 UNKNOWN 299
data2 <- mutate(data, spend_amount = spend_amount/100) # Change to dollars
data3 <- group_by(data2, gender) # Group by gender
data4 <- summarize(data3, # Summarize each group
mean = mean(spend_amount),
median = median(spend_amount),
count = n())
data2%>%
filter(gender!='UNKNOWN')%>%
filter(age_group!='UNKNOWN')%>%
group_by(age_group,gender)%>%
summarise(mean=mean(spend_amount),
median=median(spend_amount),
count=n())
## Source: local data frame [14 x 5]
## Groups: age_group [?]
##
## age_group gender mean median count
## <fctr> <fctr> <dbl> <dbl> <int>
## 1 13-17 FEMALE 5.241904 2.99 919
## 2 13-17 MALE 4.688953 2.99 191
## 3 18-24 FEMALE 5.173824 2.99 2584
## 4 18-24 MALE 5.203415 2.99 410
## 5 25-29 FEMALE 5.283130 2.99 1965
## 6 25-29 MALE 5.821832 2.99 333
## 7 30-34 FEMALE 5.163309 2.99 1656
## 8 30-34 MALE 5.105196 2.99 204
## 9 35-39 FEMALE 5.303657 2.99 1567
## 10 35-39 MALE 5.060000 2.99 250
## 11 40-49 FEMALE 5.226288 2.99 2516
## 12 40-49 MALE 4.918571 2.99 371
## 13 50-64 FEMALE 5.321683 2.99 2222
## 14 50-64 MALE 5.741613 2.99 310
library(ggplot2)
data("diamonds")
diamonds
## # A tibble: 53,940 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4.00 4.05 2.39
## # ... with 53,930 more rows
diamonds.m <- mutate(diamonds,price_per_carat=price/carat)
diamonds.m
## # A tibble: 53,940 × 11
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4.00 4.05 2.39
## # ... with 53,930 more rows, and 1 more variables: price_per_carat <dbl>
diamonds2 <- group_by(diamonds.m,cut)
diamonds2
## Source: local data frame [53,940 x 11]
## Groups: cut [5]
##
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4.00 4.05 2.39
## # ... with 53,930 more rows, and 1 more variables: price_per_carat <dbl>
diamond3 <- summarise(diamonds2,mean.price.per.carat=mean(price_per_carat))
diamond3
## # A tibble: 5 × 2
## cut mean.price.per.carat
## <ord> <dbl>
## 1 Fair 3767.256
## 2 Good 3860.028
## 3 Very Good 4014.128
## 4 Premium 4222.905
## 5 Ideal 3919.700
diamonds %>%
mutate(price_per_carat=price/carat)%>%
group_by(cut)%>%
summarise(mean.price.per.carat=mean(price_per_carat),
median.price.per.carat=median(price_per_carat),
count=n())
## # A tibble: 5 × 4
## cut mean.price.per.carat median.price.per.carat count
## <ord> <dbl> <dbl> <int>
## 1 Fair 3767.256 3449.444 1610
## 2 Good 3860.028 3613.250 4906
## 3 Very Good 4014.128 3605.826 12082
## 4 Premium 4222.905 3763.333 13791
## 5 Ideal 3919.700 3307.143 21551
diamonds %>%
mutate(price_per_carat=price/carat)%>%
group_by(clarity)%>%
summarise(mean.price.per.carat=mean(price_per_carat),
median.price.per.carat=median(price_per_carat),
count=n())
## # A tibble: 8 × 4
## clarity mean.price.per.carat median.price.per.carat count
## <ord> <dbl> <dbl> <int>
## 1 I1 2796.296 2887.387 741
## 2 SI2 4010.854 3951.301 9194
## 3 SI1 3849.078 3669.444 13065
## 4 VS2 4080.527 3428.571 12258
## 5 VS1 4155.817 3450.000 8171
## 6 VVS2 4204.166 3168.750 5066
## 7 VVS1 3851.411 2982.353 3655
## 8 IF 4259.932 3156.458 1790
diamonds %>%
mutate(price_per_carat=price/carat)%>%
group_by(color)%>%
summarise(mean.price.per.carat=mean(price_per_carat),
median.price.per.carat=median(price_per_carat),
count=n())
## # A tibble: 7 × 4
## color mean.price.per.carat median.price.per.carat count
## <ord> <dbl> <dbl> <int>
## 1 D 3952.564 3410.526 6775
## 2 E 3804.611 3253.659 9797
## 3 F 4134.731 3494.316 9542
## 4 G 4163.412 3490.385 11292
## 5 H 4008.027 3818.889 8304
## 6 I 3996.402 3779.739 5422
## 7 J 3825.649 3780.000 2808