First we have to load tidyverse.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.8
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Loading the flights data from nycflights13.
my_data<-nycflights13::flights
head(my_data)
## # A tibble: 6 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
tail(my_data)
## # A tibble: 6 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 9 30 NA 1842 NA NA 2019
## 2 2013 9 30 NA 1455 NA NA 1634
## 3 2013 9 30 NA 2200 NA NA 2312
## 4 2013 9 30 NA 1210 NA NA 1330
## 5 2013 9 30 NA 1159 NA NA 1344
## 6 2013 9 30 NA 840 NA NA 1020
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
First we will just look at the flights on October 14th.
filter(my_data, month==10, day==14)
## # A tibble: 987 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 10 14 451 500 -9 624 648
## 2 2013 10 14 511 517 -6 733 757
## 3 2013 10 14 536 545 -9 814 855
## 4 2013 10 14 540 545 -5 932 933
## 5 2013 10 14 548 545 3 824 827
## 6 2013 10 14 549 600 -11 719 730
## 7 2013 10 14 552 600 -8 650 659
## 8 2013 10 14 553 600 -7 646 700
## 9 2013 10 14 554 600 -6 836 829
## 10 2013 10 14 555 600 -5 832 855
## # … with 977 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
oct_14_flights<-filter(my_data, month==10, day==14)
Wrap the assignment in parentheses if you want to both save the result and print it.
(oct_14_flights<-filter(my_data, month==10, day==14))
## # A tibble: 987 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 10 14 451 500 -9 624 648
## 2 2013 10 14 511 517 -6 733 757
## 3 2013 10 14 536 545 -9 814 855
## 4 2013 10 14 540 545 -5 932 933
## 5 2013 10 14 548 545 3 824 827
## 6 2013 10 14 549 600 -11 719 730
## 7 2013 10 14 552 600 -8 650 659
## 8 2013 10 14 553 600 -7 646 700
## 9 2013 10 14 554 600 -6 836 829
## 10 2013 10 14 555 600 -5 832 855
## # … with 977 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
You can also filter with other comparison operators.
Finding flights through September:
(flight_through_september<- filter(my_data, month<10))
## # A tibble: 252,484 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 252,474 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
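A few more comparison filters, as a quick sketch (not run here); between() is a dplyr helper:
filter(my_data, month >= 10)           # October onward
filter(my_data, dep_delay <= 0)        # flights that left on time or early
filter(my_data, between(month, 7, 9))  # July through September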
If we mistakenly write = instead of == to test equality, filter() throws an error:
(oct_14_flight2<- filter(my_data, month=10, day=14))
You can also use logical operators to be more selective.
Picking flights in March or April:
(march_april_flights<-filter(my_data, month==3 | month==4))
## # A tibble: 57,164 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 3 1 4 2159 125 318 56
## 2 2013 3 1 50 2358 52 526 438
## 3 2013 3 1 117 2245 152 223 2354
## 4 2013 3 1 454 500 -6 633 648
## 5 2013 3 1 505 515 -10 746 810
## 6 2013 3 1 521 530 -9 813 827
## 7 2013 3 1 537 540 -3 856 850
## 8 2013 3 1 541 545 -4 1014 1023
## 9 2013 3 1 549 600 -11 639 703
## 10 2013 3 1 550 600 -10 747 801
## # … with 57,154 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
Using & (and) with two different months would return nothing, because a flight cannot occur in both months; & does work across different variables, such as month and day:
(march_24th_flights<-filter(my_data, month==3 & day==24))
## # A tibble: 905 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 3 24 2 2052 190 149 2314
## 2 2013 3 24 8 2359 9 347 338
## 3 2013 3 24 19 2245 94 124 2356
## 4 2013 3 24 36 2155 161 332 48
## 5 2013 3 24 40 2355 45 414 340
## 6 2013 3 24 458 500 -2 658 648
## 7 2013 3 24 515 515 0 823 816
## 8 2013 3 24 543 545 -2 902 923
## 9 2013 3 24 550 600 -10 652 725
## 10 2013 3 24 554 600 -6 848 849
## # … with 895 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
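As a sketch (not run here), the %in% operator gives the same March/April result more compactly:
(march_april_flights<-filter(my_data, month %in% c(3, 4)))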
Finding flights other than the ones in January:
(non_jan_flights<-filter(my_data, month!=1))
## # A tibble: 309,772 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 10 1 447 500 -13 614 648
## 2 2013 10 1 522 517 5 735 757
## 3 2013 10 1 536 545 -9 809 855
## 4 2013 10 1 539 545 -6 801 827
## 5 2013 10 1 539 545 -6 917 933
## 6 2013 10 1 544 550 -6 912 932
## 7 2013 10 1 549 600 -11 653 716
## 8 2013 10 1 550 600 -10 648 700
## 9 2013 10 1 550 600 -10 649 659
## 10 2013 10 1 551 600 -9 727 730
## # … with 309,762 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
arrange() sorts the data set by the variables we give it, in order. For example:
arrange(my_data, month, day, year)
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
Sorting our data in descending order:
descending_data<-arrange(my_data, desc(year), desc(month), desc(day))
Note: missing values are always placed at the end of the data frame regardless of ascending or descending.
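A tiny sketch (not run here) makes the NA behavior concrete:
df <- tibble(x = c(5, NA, 2))
arrange(df, x)        # 2, 5, NA
arrange(df, desc(x))  # 5, 2, NA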
We can select specific columns we want to look at.
Let's pull out only the date columns.
(calendar<-select(my_data, year, month,day))
## # A tibble: 336,776 × 3
## year month day
## <int> <int> <int>
## 1 2013 1 1
## 2 2013 1 1
## 3 2013 1 1
## 4 2013 1 1
## 5 2013 1 1
## 6 2013 1 1
## 7 2013 1 1
## 8 2013 1 1
## 9 2013 1 1
## 10 2013 1 1
## # … with 336,766 more rows
We can also look at a range of columns.
calendar2<-select(my_data, year:day)
Let's look at columns month:carrier.
(calendar2<-select(my_data, month:carrier))
## # A tibble: 336,776 × 9
## month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <dbl> <int> <int>
## 1 1 1 517 515 2 830 819
## 2 1 1 533 529 4 850 830
## 3 1 1 542 540 2 923 850
## 4 1 1 544 545 -1 1004 1022
## 5 1 1 554 600 -6 812 837
## 6 1 1 554 558 -4 740 728
## 7 1 1 555 600 -5 913 854
## 8 1 1 557 600 -3 709 723
## 9 1 1 557 600 -3 838 846
## 10 1 1 558 600 -2 753 745
## # … with 336,766 more rows, and 2 more variables: arr_delay <dbl>,
## # carrier <chr>
Taking out columns year through day
(everything_else<-select(my_data, -(year:day)))
## # A tibble: 336,776 × 16
## dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier
## <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 517 515 2 830 819 11 UA
## 2 533 529 4 850 830 20 UA
## 3 542 540 2 923 850 33 AA
## 4 544 545 -1 1004 1022 -18 B6
## 5 554 600 -6 812 837 -25 DL
## 6 554 558 -4 740 728 12 UA
## 7 555 600 -5 913 854 19 B6
## 8 557 600 -3 709 723 -14 EV
## 9 557 600 -3 838 846 -8 B6
## 10 558 600 -2 753 745 8 AA
## # … with 336,766 more rows, and 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
or
(everything_else<-select(my_data, !(year:day)))
## # A tibble: 336,776 × 16
## dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier
## <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 517 515 2 830 819 11 UA
## 2 533 529 4 850 830 20 UA
## 3 542 540 2 923 850 33 AA
## 4 544 545 -1 1004 1022 -18 B6
## 5 554 600 -6 812 837 -25 DL
## 6 554 558 -4 740 728 12 UA
## 7 555 600 -5 913 854 19 B6
## 8 557 600 -3 709 723 -14 EV
## 9 557 600 -3 838 846 -8 B6
## 10 558 600 -2 753 745 8 AA
## # … with 336,766 more rows, and 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
There are also helper functions for selecting columns (a sketch follows); first, a reminder of what the data looks like:
head(my_data)
## # A tibble: 6 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
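As a sketch (not run here), tidyselect helpers make column selection flexible:
select(my_data, starts_with("sched"))    # sched_dep_time, sched_arr_time
select(my_data, ends_with("delay"))      # dep_delay, arr_delay
select(my_data, contains("time"))        # every column whose name contains "time"
select(my_data, time_hour, everything()) # move time_hour to the front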
Renaming the dep_time column of my_data to departure_time:
rename(my_data, departure_time=dep_time)
## # A tibble: 336,776 × 19
## year month day departure_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # … with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,
## # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
my_data<-rename(my_data, departure_time=dep_time)
Adding new columns to a data frame with mutate().
Making a smaller data frame so we can see what we're doing:
my_data_small<-select(my_data, year:day, distance, air_time)
For a mutate() example, let's calculate the speed of each flight and add it to my_data_small:
mutate(my_data_small, speed = distance / air_time*60)
## # A tibble: 336,776 × 6
## year month day distance air_time speed
## <int> <int> <int> <dbl> <dbl> <dbl>
## 1 2013 1 1 1400 227 370.
## 2 2013 1 1 1416 227 374.
## 3 2013 1 1 1089 160 408.
## 4 2013 1 1 1576 183 517.
## 5 2013 1 1 762 116 394.
## 6 2013 1 1 719 150 288.
## 7 2013 1 1 1065 158 404.
## 8 2013 1 1 229 53 259.
## 9 2013 1 1 944 140 405.
## 10 2013 1 1 733 138 319.
## # … with 336,766 more rows
my_data_small<-mutate(my_data_small, speed = distance / air_time*60)
What if we wanted to make a new data frame containing only our calculations?
We would use transmute().
(airspeed<-transmute(my_data_small, speed = distance/air_time * 60, speed2=distance/air_time *60))
## # A tibble: 336,776 × 2
## speed speed2
## <dbl> <dbl>
## 1 370. 370.
## 2 374. 374.
## 3 408. 408.
## 4 517. 517.
## 5 394. 394.
## 6 288. 288.
## 7 404. 404.
## 8 259. 259.
## 9 405. 405.
## 10 319. 319.
## # … with 336,766 more rows
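As another sketch (not run here), transmute() can build on columns created in the same call:
transmute(my_data,
  gain = dep_delay - arr_delay,
  hours = air_time / 60,
  gain_per_hour = gain / hours
)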
We can use summarize() to collapse a column down to a single value.
(summarize(my_data, delay=mean(dep_delay, na.rm=TRUE)))
## # A tibble: 1 × 1
## delay
## <dbl>
## 1 12.6
So we can see here that the average departure delay is about 12.6 minutes.
We gain additional value from summarize() by pairing it with group_by().
by_day<-group_by(my_data, year, month, day)
summarize(by_day, delay=mean(dep_delay, na.rm=TRUE))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups: year, month [12]
## year month day delay
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.5
## 2 2013 1 2 13.9
## 3 2013 1 3 11.0
## 4 2013 1 4 8.95
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.55
## 9 2013 1 9 2.28
## 10 2013 1 10 2.84
## # … with 355 more rows
As you can see, we now have the mean delay for each day of the year.
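As a sketch (not run here), summarize() can also return several statistics at once; n() counts the rows in each group:
summarize(by_day,
  delay = mean(dep_delay, na.rm=TRUE),
  sd = sd(dep_delay, na.rm=TRUE),
  n = n()
)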
What happens if we don't tell R what to do with the missing data?
summarize(by_day, delay=mean(dep_delay))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups: year, month [12]
## year month day delay
## <int> <int> <int> <dbl>
## 1 2013 1 1 NA
## 2 2013 1 2 NA
## 3 2013 1 3 NA
## 4 2013 1 4 NA
## 5 2013 1 5 NA
## 6 2013 1 6 NA
## 7 2013 1 7 NA
## 8 2013 1 8 NA
## 9 2013 1 9 NA
## 10 2013 1 10 NA
## # … with 355 more rows
We can also filter our data based on NA (which in this data set indicates cancelled flights).
not_cancelled<-filter(my_data, !is.na(dep_delay),!is.na(arr_delay))
Let’s run summarize again on this data
summarize(not_cancelled, delay=mean(dep_delay))
## # A tibble: 1 × 1
## delay
## <dbl>
## 1 12.6
We can also count the number of values that are NA.
sum(is.na(my_data$dep_delay))
## [1] 8255
We can also count the values that are not NA.
sum(!is.na(my_data$dep_delay))
## [1] 328521
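The same idea works inside summarize(); as a sketch (not run here), this counts cancelled flights per day:
my_data %>%
  group_by(year, month, day) %>%
  summarize(cancelled = sum(is.na(dep_delay)))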
With tibble datasets (more on them soon), we can pipe results and avoid the "$" sign.
For example, we can summarize the mean departure time for each day:
my_data %>%
group_by(year, month, day) %>%
summarize( mean = mean(departure_time, na.rm=TRUE))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups: year, month [12]
## year month day mean
## <int> <int> <int> <dbl>
## 1 2013 1 1 1385.
## 2 2013 1 2 1354.
## 3 2013 1 3 1357.
## 4 2013 1 4 1348.
## 5 2013 1 5 1326.
## 6 2013 1 6 1399.
## 7 2013 1 7 1341.
## 8 2013 1 8 1335.
## 9 2013 1 9 1326.
## 10 2013 1 10 1333.
## # … with 355 more rows
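To see what the pipe buys us, compare the nested form with the piped form of the same computation (a sketch, not run here):
# nested: read inside out
summarize(group_by(my_data, year, month, day), mean = mean(departure_time, na.rm=TRUE))
# piped: read left to right
my_data %>% group_by(year, month, day) %>% summarize(mean = mean(departure_time, na.rm=TRUE))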
First we must load tibble.
library(tibble)
Now we will take the time to explore tibbles. Tibbles are modified data frames that tweak some older data frame behaviors; R is an old language, and some features that were useful 20 years ago are less useful now.
Let's convert the built-in iris data frame to a tibble.
as_tibble(iris)
## # A tibble: 150 × 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## # … with 140 more rows
As we can see, we have the same data, but printed with tibble's features.
You can also create a tibble from scratch with tibble:
tibble(
x=1:5,
y=1,
z=x^2 + y
)
## # A tibble: 5 × 3
## x y z
## <int> <dbl> <dbl>
## 1 1 1 2
## 2 2 1 5
## 3 3 1 10
## 4 4 1 17
## 5 5 1 26
You can also use tribble() for basic data table creation.
tribble(
~gene.a,~gene.b,~gene.c,
#######################
110, 112, 114,
6, 5, 4
)
## # A tibble: 2 × 3
## gene.a gene.b gene.c
## <dbl> <dbl> <dbl>
## 1 110 112 114
## 2 6 5 4
Tibbles are built to avoid overwhelming your console, printing only the first few rows.
This is how a tibble prints:
print(by_day)
## # A tibble: 336,776 × 19
## # Groups: year, month, day [365]
## year month day departure_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # … with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,
## # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
head(by_day)
## # A tibble: 6 × 19
## # Groups: year, month, day [1]
## year month day departure_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## # … with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
Printing flights from nycflights13 through the pipe, with an explicit row count and width:
nycflights13::flights%>%
print(n=10, width=Inf)
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## arr_delay carrier flight tailnum origin dest air_time distance hour minute
## <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 11 UA 1545 N14228 EWR IAH 227 1400 5 15
## 2 20 UA 1714 N24211 LGA IAH 227 1416 5 29
## 3 33 AA 1141 N619AA JFK MIA 160 1089 5 40
## 4 -18 B6 725 N804JB JFK BQN 183 1576 5 45
## 5 -25 DL 461 N668DN LGA ATL 116 762 6 0
## 6 12 UA 1696 N39463 EWR ORD 150 719 5 58
## 7 19 B6 507 N516JB EWR FLL 158 1065 6 0
## 8 -14 EV 5708 N829AS LGA IAD 53 229 6 0
## 9 -8 B6 79 N593JB JFK MCO 140 944 6 0
## 10 8 AA 301 N3ALAA LGA ORD 138 733 6 0
## time_hour
## <dttm>
## 1 2013-01-01 05:00:00
## 2 2013-01-01 05:00:00
## 3 2013-01-01 05:00:00
## 4 2013-01-01 05:00:00
## 5 2013-01-01 06:00:00
## 6 2013-01-01 05:00:00
## 7 2013-01-01 06:00:00
## 8 2013-01-01 06:00:00
## 9 2013-01-01 06:00:00
## 10 2013-01-01 06:00:00
## # … with 336,766 more rows
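You can also change the defaults globally; as a sketch, these options come from the tibble package:
options(tibble.print_min = 15)  # show 15 rows when a tibble is truncated
options(tibble.print_max = 25)  # print whole tibbles of up to 25 rows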
Subsetting tibbles is easy and similar to data frames. Here is an example of subsetting nycflights13 to show carriers:
head(df_tibble<-tibble(nycflights13::flights))
## # A tibble: 6 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
head(df_tibble$carrier)
## [1] "UA" "UA" "AA" "B6" "DL" "UA"
We can subset by position using "[[ ]]".
head(df_tibble[[2]])
## [1] 1 1 1 1 1 1
If you want to use this in a pipe, you need to use the "." placeholder.
head(df_tibble %>%
.$carrier)
## [1] "UA" "UA" "AA" "B6" "DL" "UA"
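As a sketch (not run here), [[ ]] also accepts column names, both directly and inside a pipe:
head(df_tibble[["carrier"]])
df_tibble %>%
  .[["carrier"]] %>%
  head()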
Some older functions do not like tibbles, so you might have to convert them back to a data frame.
Let's check the class of our data first.
class(df_tibble)
## [1] "tbl_df" "tbl" "data.frame"
Now we can convert this data into a data frame.
df_tibble_2<-as.data.frame(df_tibble)
Using the class() function, we can confirm that it converted.
class(df_tibble_2)
## [1] "data.frame"
head(df_tibble_2)
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## arr_delay carrier flight tailnum origin dest air_time distance hour minute
## 1 11 UA 1545 N14228 EWR IAH 227 1400 5 15
## 2 20 UA 1714 N24211 LGA IAH 227 1416 5 29
## 3 33 AA 1141 N619AA JFK MIA 160 1089 5 40
## 4 -18 B6 725 N804JB JFK BQN 183 1576 5 45
## 5 -25 DL 461 N668DN LGA ATL 116 762 6 0
## 6 12 UA 1696 N39463 EWR ORD 150 719 5 58
## time_hour
## 1 2013-01-01 05:00:00
## 2 2013-01-01 05:00:00
## 3 2013-01-01 05:00:00
## 4 2013-01-01 05:00:00
## 5 2013-01-01 06:00:00
## 6 2013-01-01 05:00:00
Let's load tidyverse again.
library(tidyverse)
How do we make a tidy dataset? Tidy data follows three rules:
1. Each variable must have its own column.
2. Each observation must have its own row.
3. Each value must have its own cell.
Note: It is impossible to satisfy only two of the three rules! This leads to the following practical advice for tidy data.
Picking one consistent method of data storage makes for easier understanding of your code and what is happening “under the hood” or behind the scenes.
Let's now pipe a tibble through mutate() to calculate BMI from the built-in women data set.
bmi<-tibble(women)
bmi%>%
mutate(bmi=(703*weight)/(height)^2)
## # A tibble: 15 × 3
## height weight bmi
## <dbl> <dbl> <dbl>
## 1 58 115 24.0
## 2 59 117 23.6
## 3 60 120 23.4
## 4 61 123 23.2
## 5 62 126 23.0
## 6 63 129 22.8
## 7 64 132 22.7
## 8 65 135 22.5
## 9 66 139 22.4
## 10 67 142 22.2
## 11 68 146 22.2
## 12 69 150 22.1
## 13 70 154 22.1
## 14 71 159 22.2
## 15 72 164 22.2
Sometimes we will find data sets that are not tidy. This is common; we will use the built-in example tables from tidyverse.
table4a
## # A tibble: 3 × 3
## country `1999` `2000`
## * <chr> <int> <int>
## 1 Afghanistan 745 2666
## 2 Brazil 37737 80488
## 3 China 212258 213766
As you can see, country is a proper variable, but the 1999 and 2000 columns are really values of a year variable, so each row holds two observations.
To fix this, we can use the gather() function.
table4a%>%
gather('1999','2000', key = 'year', value = 'cases')
## # A tibble: 6 × 3
## country year cases
## <chr> <chr> <int>
## 1 Afghanistan 1999 745
## 2 Brazil 1999 37737
## 3 China 1999 212258
## 4 Afghanistan 2000 2666
## 5 Brazil 2000 80488
## 6 China 2000 213766
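Newer versions of tidyr supersede gather() with pivot_longer(); as a sketch (not run here), the equivalent call is:
table4a%>%
  pivot_longer(c('1999','2000'), names_to='year', values_to='cases')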
Another example:
table4b
## # A tibble: 3 × 3
## country `1999` `2000`
## * <chr> <int> <int>
## 1 Afghanistan 19987071 20595360
## 2 Brazil 172006362 174504898
## 3 China 1272915272 1280428583
Piping the table4b data and gathering the years 1999 and 2000:
table4b%>%
gather('1999','2000', key='year',value='population')
## # A tibble: 6 × 3
## country year population
## <chr> <chr> <int>
## 1 Afghanistan 1999 19987071
## 2 Brazil 1999 172006362
## 3 China 1999 1272915272
## 4 Afghanistan 2000 20595360
## 5 Brazil 2000 174504898
## 6 China 2000 1280428583
Now if we want to join these two tables, we can use dplyr.
table4a<-table4a%>%
gather('1999','2000', key = 'year', value = 'cases')
table4b<-table4b%>%
gather('1999','2000', key='year',value='population')
left_join(table4a,table4b)
## Joining, by = c("country", "year")
## # A tibble: 6 × 4
## country year cases population
## <chr> <chr> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Brazil 1999 37737 172006362
## 3 China 1999 212258 1272915272
## 4 Afghanistan 2000 2666 20595360
## 5 Brazil 2000 80488 174504898
## 6 China 2000 213766 1280428583
Spreading is the opposite of gathering:
table2
## # A tibble: 12 × 4
## country year type count
## <chr> <int> <chr> <int>
## 1 Afghanistan 1999 cases 745
## 2 Afghanistan 1999 population 19987071
## 3 Afghanistan 2000 cases 2666
## 4 Afghanistan 2000 population 20595360
## 5 Brazil 1999 cases 37737
## 6 Brazil 1999 population 172006362
## 7 Brazil 2000 cases 80488
## 8 Brazil 2000 population 174504898
## 9 China 1999 cases 212258
## 10 China 1999 population 1272915272
## 11 China 2000 cases 213766
## 12 China 2000 population 1280428583
We have redundant country/year info across rows, which is not tidy; we can fix this by spreading each pair of rows (1 and 2, 3 and 4, etc.) into one:
spread(table2, key=type, value=count)
## # A tibble: 6 × 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
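Likewise, spread() is superseded by pivot_wider(); a sketch (not run here):
table2%>%
  pivot_wider(names_from=type, values_from=count)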
"type" is the key: the column whose values become the new column names; "count" supplies the values that fill them.
spread() makes long tables shorter and wider.
gather() makes wide tables narrower and longer.
Now what happens when we have two values stuck in one column?
table3
## # A tibble: 6 × 3
## country year rate
## * <chr> <int> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583
The rate column is cases and population combined.
We can use separate() to fix this.
table3 %>%
separate(rate, into = c("cases", "population"))
## # A tibble: 6 × 4
## country year cases population
## <chr> <int> <chr> <chr>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
However, notice that the new column types are character rather than integer. Adding convert=TRUE fixes this:
table3%>%
  separate(rate, into=c('cases','population'), convert=TRUE)
## # A tibble: 6 × 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
You can also specify the separator explicitly:
table3%>%
  separate(rate, into=c('cases','population'), sep='/', convert=TRUE)
## # A tibble: 6 × 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
Piping table3 and separating century and year by position:
table3%>%
  separate(
    year,
    into=c('century','year'),
    convert=TRUE,
    sep=2)
## # A tibble: 6 × 4
## country century year rate
## <chr> <int> <int> <chr>
## 1 Afghanistan 19 99 745/19987071
## 2 Afghanistan 20 0 2666/20595360
## 3 Brazil 19 99 37737/172006362
## 4 Brazil 20 0 80488/174504898
## 5 China 19 99 212258/1272915272
## 6 China 20 0 213766/1280428583
It is rare that you will work with only a single data table. The dplyr package allows you to join two data tables based on common values.
Mutating joins add new variables to one data frame from the matching observations in another.
Filtering joins filter observations from one data frame based on whether or not they are present in another.
Set operations treat observations as if they were set elements.
Load both tidyverse and nycflights13:
library(tidyverse)
library(nycflights13)
Let's look at the full carrier names keyed by their two-letter codes.
airlines
## # A tibble: 16 × 2
## carrier name
## <chr> <chr>
## 1 9E Endeavor Air Inc.
## 2 AA American Airlines Inc.
## 3 AS Alaska Airlines Inc.
## 4 B6 JetBlue Airways
## 5 DL Delta Air Lines Inc.
## 6 EV ExpressJet Airlines Inc.
## 7 F9 Frontier Airlines Inc.
## 8 FL AirTran Airways Corporation
## 9 HA Hawaiian Airlines Inc.
## 10 MQ Envoy Air
## 11 OO SkyWest Airlines Inc.
## 12 UA United Air Lines Inc.
## 13 US US Airways Inc.
## 14 VX Virgin America
## 15 WN Southwest Airlines Co.
## 16 YV Mesa Airlines Inc.
Let's get info about airports.
airports
## # A tibble: 1,458 × 8
## faa name lat lon alt tz dst tzone
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A America/…
## 2 06A Moton Field Municipal Airport 32.5 -85.7 264 -6 A America/…
## 3 06C Schaumburg Regional 42.0 -88.1 801 -6 A America/…
## 4 06N Randall Airport 41.4 -74.4 523 -5 A America/…
## 5 09J Jekyll Island Airport 31.1 -81.4 11 -5 A America/…
## 6 0A9 Elizabethton Municipal Airport 36.4 -82.2 1593 -5 A America/…
## 7 0G6 Williams County Airport 41.5 -84.5 730 -5 A America/…
## 8 0G7 Finger Lakes Regional Airport 42.9 -76.8 492 -5 A America/…
## 9 0P2 Shoestring Aviation Airfield 39.8 -76.6 1000 -5 U America/…
## 10 0S9 Jefferson County Intl 48.1 -123. 108 -8 A America/…
## # … with 1,448 more rows
Info about each plane.
planes
## # A tibble: 3,322 × 9
## tailnum year type manufacturer model engines seats speed engine
## <chr> <int> <chr> <chr> <chr> <int> <int> <int> <chr>
## 1 N10156 2004 Fixed wing multi… EMBRAER EMB-… 2 55 NA Turbo…
## 2 N102UW 1998 Fixed wing multi… AIRBUS INDU… A320… 2 182 NA Turbo…
## 3 N103US 1999 Fixed wing multi… AIRBUS INDU… A320… 2 182 NA Turbo…
## 4 N104UW 1999 Fixed wing multi… AIRBUS INDU… A320… 2 182 NA Turbo…
## 5 N10575 2002 Fixed wing multi… EMBRAER EMB-… 2 55 NA Turbo…
## 6 N105UW 1999 Fixed wing multi… AIRBUS INDU… A320… 2 182 NA Turbo…
## 7 N107US 1999 Fixed wing multi… AIRBUS INDU… A320… 2 182 NA Turbo…
## 8 N108UW 1999 Fixed wing multi… AIRBUS INDU… A320… 2 182 NA Turbo…
## 9 N109UW 1999 Fixed wing multi… AIRBUS INDU… A320… 2 182 NA Turbo…
## 10 N110UW 1999 Fixed wing multi… AIRBUS INDU… A320… 2 182 NA Turbo…
## # … with 3,312 more rows
Info on weather.
weather
## # A tibble: 26,115 × 15
## origin year month day hour temp dewp humid wind_dir wind_speed
## <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 1 39.0 26.1 59.4 270 10.4
## 2 EWR 2013 1 1 2 39.0 27.0 61.6 250 8.06
## 3 EWR 2013 1 1 3 39.0 28.0 64.4 240 11.5
## 4 EWR 2013 1 1 4 39.9 28.0 62.2 250 12.7
## 5 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7
## 6 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5
## 7 EWR 2013 1 1 7 39.0 28.0 64.4 240 15.0
## 8 EWR 2013 1 1 8 39.9 28.0 62.2 250 10.4
## 9 EWR 2013 1 1 9 39.9 28.0 62.2 260 15.0
## 10 EWR 2013 1 1 10 41 28.0 59.6 260 13.8
## # … with 26,105 more rows, and 5 more variables: wind_gust <dbl>, precip <dbl>,
## # pressure <dbl>, visib <dbl>, time_hour <dttm>
Info on individual flights.
flights
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
Let's see how these tables connect.
Keys are unique identifiers per observation. A primary key uniquely identifies an observation in its own table. One way to check a primary key is to count it and look for duplicates:
planes%>%
count(tailnum)%>%
filter(n>1)
## # A tibble: 0 × 2
## # … with 2 variables: tailnum <chr>, n <int>
This indicates that each tail number is unique. The model column, by contrast, is not:
planes%>%
count(model)%>%
filter(n>1)
## # A tibble: 79 × 2
## model n
## <chr> <int>
## 1 717-200 88
## 2 737-301 2
## 3 737-3G7 2
## 4 737-3H4 105
## 5 737-3K2 2
## 6 737-3L9 2
## 7 737-3Q8 5
## 8 737-3TO 2
## 9 737-401 4
## 10 737-4B7 18
## # … with 69 more rows
So sometimes a column, like model, is not a unique identifier on its own.
Selecting a subset of columns to make a smaller data frame:
flights2<-flights%>%
select(year:day, hour, origin, dest,tailnum, carrier)
This is the data we have now:
flights2
## # A tibble: 336,776 × 8
## year month day hour origin dest tailnum carrier
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr>
## 1 2013 1 1 5 EWR IAH N14228 UA
## 2 2013 1 1 5 LGA IAH N24211 UA
## 3 2013 1 1 5 JFK MIA N619AA AA
## 4 2013 1 1 5 JFK BQN N804JB B6
## 5 2013 1 1 6 LGA ATL N668DN DL
## 6 2013 1 1 5 EWR ORD N39463 UA
## 7 2013 1 1 6 EWR FLL N516JB B6
## 8 2013 1 1 6 LGA IAD N829AS EV
## 9 2013 1 1 6 JFK MCO N593JB B6
## 10 2013 1 1 6 LGA ORD N3ALAA AA
## # … with 336,766 more rows
We can pipe this further and add the airline name to our data.
flights2 %>%
select(-origin, -dest) %>%
left_join(airlines, by = 'carrier')
## # A tibble: 336,776 × 7
## year month day hour tailnum carrier name
## <int> <int> <int> <dbl> <chr> <chr> <chr>
## 1 2013 1 1 5 N14228 UA United Air Lines Inc.
## 2 2013 1 1 5 N24211 UA United Air Lines Inc.
## 3 2013 1 1 5 N619AA AA American Airlines Inc.
## 4 2013 1 1 5 N804JB B6 JetBlue Airways
## 5 2013 1 1 6 N668DN DL Delta Air Lines Inc.
## 6 2013 1 1 5 N39463 UA United Air Lines Inc.
## 7 2013 1 1 6 N516JB B6 JetBlue Airways
## 8 2013 1 1 6 N829AS EV ExpressJet Airlines Inc.
## 9 2013 1 1 6 N593JB B6 JetBlue Airways
## 10 2013 1 1 6 N3ALAA AA American Airlines Inc.
## # … with 336,766 more rows
We’ve now added the airline name to our data frame from the airline data frame.
There are other types of joins as well.
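As a sketch (not run here), dplyr's other mutating and filtering joins use the same by= syntax:
inner_join(flights2, airlines, by='carrier')  # keep only rows matched in both
right_join(flights2, airlines, by='carrier')  # keep all rows of airlines
full_join(flights2, airlines, by='carrier')   # keep all rows of both
semi_join(flights2, airlines, by='carrier')   # filter flights2 to matched rows
anti_join(flights2, airlines, by='carrier')   # filter flights2 to unmatched rows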
What happens if we want to do the inverse of separate()? First, let's look at table5.
table5
## # A tibble: 6 × 4
## country century year rate
## * <chr> <chr> <chr> <chr>
## 1 Afghanistan 19 99 745/19987071
## 2 Afghanistan 20 00 2666/20595360
## 3 Brazil 19 99 37737/172006362
## 4 Brazil 20 00 80488/174504898
## 5 China 19 99 212258/1272915272
## 6 China 20 00 213766/1280428583
We can pipe this down and unite the century and year columns in the data set.
table5%>%
unite(data, century, year, sep="")
## # A tibble: 6 × 3
## country data rate
## <chr> <chr> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583
There are two types of missing values: explicit (flagged with NA) and implicit (simply absent from the data).
Let's make a tibble to demonstrate this.
gene_data<- tibble(
gene=c('a','a','a','a','b','b','b'),
nuc=c(20,22,24,25,NA,42,67),
run=c(1,2,3,4,2,3,4)
)
Here is what that looks like.
gene_data
## # A tibble: 7 × 3
## gene nuc run
## <chr> <dbl> <dbl>
## 1 a 20 1
## 2 a 22 2
## 3 a 24 3
## 4 a 25 4
## 5 b NA 2
## 6 b 42 3
## 7 b 67 4
One way to make implicit missing values explicit is to spread the genes into columns:
gene_data%>%
spread(gene, nuc)
## # A tibble: 4 × 3
## run a b
## <dbl> <dbl> <dbl>
## 1 1 20 NA
## 2 2 22 NA
## 3 3 24 42
## 4 4 25 67
If we want to drop the missing values entirely, we can spread and then gather with na.rm=TRUE:
gene_data%>%
spread(gene,nuc)%>%
gather(gene,nuc,'a':'b', na.rm=TRUE)
## # A tibble: 6 × 3
## run gene nuc
## <dbl> <chr> <dbl>
## 1 1 a 20
## 2 2 a 22
## 3 3 a 24
## 4 4 a 25
## 5 3 b 42
## 6 4 b 67
Another way to make implicit missing values explicit is complete():
gene_data%>%
complete(gene,nuc)
## # A tibble: 14 × 3
## gene nuc run
## <chr> <dbl> <dbl>
## 1 a 20 1
## 2 a 22 2
## 3 a 24 3
## 4 a 25 4
## 5 a 42 NA
## 6 a 67 NA
## 7 a NA NA
## 8 b 20 NA
## 9 b 22 NA
## 10 b 24 NA
## 11 b 25 NA
## 12 b 42 3
## 13 b 67 4
## 14 b NA 2
Sometimes an NA is present to represent a value being carried forward.
treatment <- tribble(
~ person, ~ treatment, ~ response,
"Isaac", 1, 7,
NA, 2, 10,
NA, 3, 9,
"VDB", 1, 8,
NA, 2, 11,
NA, 3, 10,
)
treatment
## # A tibble: 6 × 3
## person treatment response
## <chr> <dbl> <dbl>
## 1 Isaac 1 7
## 2 <NA> 2 10
## 3 <NA> 3 9
## 4 VDB 1 8
## 5 <NA> 2 11
## 6 <NA> 3 10
We can use the fill() function here.
treatment%>%
fill(person)
## # A tibble: 6 × 3
## person treatment response
## <chr> <dbl> <dbl>
## 1 Isaac 1 7
## 2 Isaac 2 10
## 3 Isaac 3 9
## 4 VDB 1 8
## 5 VDB 2 11
## 6 VDB 3 10
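As a sketch (not run here), fill() also takes a .direction argument to fill upward instead of the default downward:
treatment%>%
  fill(person, .direction="up")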
We have to load tidyverse and stringr first.
library(tidyverse)
library(stringr)
You can create strings using single or double quotes.
string1<-"this is a string"
string2<-'to put a "quote"in your string, use the opposite'
Let's see what this looks like.
string1
## [1] "this is a string"
string2
## [1] "to put a \"quote\"in your string, use the opposite"
If you forget to close your string with a matching quote, the console hangs with a + continuation prompt; simply hit Escape and try again. Here the string is properly closed:
string3<-"where is this string going?"
And here is what this looks like.
string3
## [1] "where is this string going?"
Multiple strings are stored in character vectors.
string4<-c("one","two","three")
string4
## [1] "one" "two" "three"
Now let's measure string length with str_length().
str_length(string3)
## [1] 27
We can combine two strings with str_c().
str_c("x","y")
## [1] "xy"
Combining our earlier strings:
str_c(string1,string2)
## [1] "this is a stringto put a \"quote\"in your string, use the opposite"
You can also use the sep argument to control how they are separated.
str_c(string1,string2, sep=" ")
## [1] "this is a string to put a \"quote\"in your string, use the opposite"
Instead of sep=" ", we now use sep="_".
str_c("x","y","z",sep= "_")
## [1] "x_y_z"
You can subset a string using str_sub(). We will demonstrate this using HSP (heat shock proteins).
HSP<-c("HSP123","HSP234","HSP456")
Results:
str_sub(HSP, 4,6)
## [1] "123" "234" "456"
This keeps characters 4 through 6, dropping the first three letters. Or you can use negatives to count back from the end:
str_sub(HSP,-3,-1)
## [1] "123" "234" "456"
You can convert the case of strings as follows:
HSP
## [1] "HSP123" "HSP234" "HSP456"
str_to_lower(HSP)
## [1] "hsp123" "hsp234" "hsp456"
or use str_to_upper() to change to upper case.
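A quick sketch (not run here):
str_to_upper(str_to_lower(HSP))      # back to "HSP123" "HSP234" "HSP456"
str_to_title("heat shock proteins")  # "Heat Shock Proteins"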
First, we must install the "htmlwidgets" package, which str_view() uses to display matches.
install.packages("htmlwidgets")
## Installing package into '/home/student/R/x86_64-pc-linux-gnu-library/4.1'
## (as 'lib' is unspecified)
Then we make a variable with the following base sequences.
x<-c('attaga','cgccccggat','tatta')
Then we can view which strings match a given pattern. Examples:
str_view(x,"g")
str_view(x,"ta")
The next pattern is ".", which matches any single character:
str_view(x,".g.")
Anchors allow you to match at the start (^) or end ($) of a string.
str_view(x, "^ta")
str_view(x,"ta$")
Character classes and alternatives.
A character class like [gt] matches any one of the characters inside the brackets:
str_view(x, "ta[gt]")
[^abc] matches anything but a, b, or c:
str_view(x, "ta[^t]")
You can also use | to pick between two alternatives:
str_view(x, "ta(g|t)")
str_detect() returns a logical vector the same length as its input.
y<-c("apple","banana","pear")
Here is what that looks like.
y
## [1] "apple" "bannana" "pear"
Now we can see if there are any “e’s” in the data we created.
str_detect(y, "e")
## [1] TRUE FALSE TRUE
How many common words contain the letter e? Let's look at the built-in words vector first.
words
## [1] "a" "able" "about" "absolute" "accept"
## [6] "account" "achieve" "across" "act" "active"
## [11] "actual" "add" "address" "admit" "advertise"
## [16] "affect" "afford" "after" "afternoon" "again"
## [21] "against" "age" "agent" "ago" "agree"
## [26] "air" "all" "allow" "almost" "along"
## [31] "already" "alright" "also" "although" "always"
## [36] "america" "amount" "and" "another" "answer"
## [41] "any" "apart" "apparent" "appear" "apply"
## [46] "appoint" "approach" "appropriate" "area" "argue"
## [51] "arm" "around" "arrange" "art" "as"
## [56] "ask" "associate" "assume" "at" "attend"
## [61] "authority" "available" "aware" "away" "awful"
## [66] "baby" "back" "bad" "bag" "balance"
## [71] "ball" "bank" "bar" "base" "basis"
## [76] "be" "bear" "beat" "beauty" "because"
## [81] "become" "bed" "before" "begin" "behind"
## [86] "believe" "benefit" "best" "bet" "between"
## [91] "big" "bill" "birth" "bit" "black"
## [96] "bloke" "blood" "blow" "blue" "board"
## [101] "boat" "body" "book" "both" "bother"
## [106] "bottle" "bottom" "box" "boy" "break"
## [111] "brief" "brilliant" "bring" "britain" "brother"
## [116] "budget" "build" "bus" "business" "busy"
## [121] "but" "buy" "by" "cake" "call"
## [126] "can" "car" "card" "care" "carry"
## [131] "case" "cat" "catch" "cause" "cent"
## [136] "centre" "certain" "chair" "chairman" "chance"
## [141] "change" "chap" "character" "charge" "cheap"
## [146] "check" "child" "choice" "choose" "Christ"
## [151] "Christmas" "church" "city" "claim" "class"
## [156] "clean" "clear" "client" "clock" "close"
## [161] "closes" "clothe" "club" "coffee" "cold"
## [166] "colleague" "collect" "college" "colour" "come"
## [171] "comment" "commit" "committee" "common" "community"
## [176] "company" "compare" "complete" "compute" "concern"
## [181] "condition" "confer" "consider" "consult" "contact"
## [186] "continue" "contract" "control" "converse" "cook"
## [191] "copy" "corner" "correct" "cost" "could"
## [196] "council" "count" "country" "county" "couple"
## [201] "course" "court" "cover" "create" "cross"
## [206] "cup" "current" "cut" "dad" "danger"
## [211] "date" "day" "dead" "deal" "dear"
## [216] "debate" "decide" "decision" "deep" "definite"
## [221] "degree" "department" "depend" "describe" "design"
## [226] "detail" "develop" "die" "difference" "difficult"
## [231] "dinner" "direct" "discuss" "district" "divide"
## [236] "do" "doctor" "document" "dog" "door"
## [241] "double" "doubt" "down" "draw" "dress"
## [246] "drink" "drive" "drop" "dry" "due"
## [251] "during" "each" "early" "east" "easy"
## [256] "eat" "economy" "educate" "effect" "egg"
## [261] "eight" "either" "elect" "electric" "eleven"
## [266] "else" "employ" "encourage" "end" "engine"
## [271] "english" "enjoy" "enough" "enter" "environment"
## [276] "equal" "especial" "europe" "even" "evening"
## [281] "ever" "every" "evidence" "exact" "example"
## [286] "except" "excuse" "exercise" "exist" "expect"
## [291] "expense" "experience" "explain" "express" "extra"
## [296] "eye" "face" "fact" "fair" "fall"
## [301] "family" "far" "farm" "fast" "father"
## [306] "favour" "feed" "feel" "few" "field"
## [311] "fight" "figure" "file" "fill" "film"
## [316] "final" "finance" "find" "fine" "finish"
## [321] "fire" "first" "fish" "fit" "five"
## [326] "flat" "floor" "fly" "follow" "food"
## [331] "foot" "for" "force" "forget" "form"
## [336] "fortune" "forward" "four" "france" "free"
## [341] "friday" "friend" "from" "front" "full"
## [346] "fun" "function" "fund" "further" "future"
## [351] "game" "garden" "gas" "general" "germany"
## [356] "get" "girl" "give" "glass" "go"
## [361] "god" "good" "goodbye" "govern" "grand"
## [366] "grant" "great" "green" "ground" "group"
## [371] "grow" "guess" "guy" "hair" "half"
## [376] "hall" "hand" "hang" "happen" "happy"
## [381] "hard" "hate" "have" "he" "head"
## [386] "health" "hear" "heart" "heat" "heavy"
## [391] "hell" "help" "here" "high" "history"
## [396] "hit" "hold" "holiday" "home" "honest"
## [401] "hope" "horse" "hospital" "hot" "hour"
## [406] "house" "how" "however" "hullo" "hundred"
## [411] "husband" "idea" "identify" "if" "imagine"
## [416] "important" "improve" "in" "include" "income"
## [421] "increase" "indeed" "individual" "industry" "inform"
## [426] "inside" "instead" "insure" "interest" "into"
## [431] "introduce" "invest" "involve" "issue" "it"
## [436] "item" "jesus" "job" "join" "judge"
## [441] "jump" "just" "keep" "key" "kid"
## [446] "kill" "kind" "king" "kitchen" "knock"
## [451] "know" "labour" "lad" "lady" "land"
## [456] "language" "large" "last" "late" "laugh"
## [461] "law" "lay" "lead" "learn" "leave"
## [466] "left" "leg" "less" "let" "letter"
## [471] "level" "lie" "life" "light" "like"
## [476] "likely" "limit" "line" "link" "list"
## [481] "listen" "little" "live" "load" "local"
## [486] "lock" "london" "long" "look" "lord"
## [491] "lose" "lot" "love" "low" "luck"
## [496] "lunch" "machine" "main" "major" "make"
## [501] "man" "manage" "many" "mark" "market"
## [506] "marry" "match" "matter" "may" "maybe"
## [511] "mean" "meaning" "measure" "meet" "member"
## [516] "mention" "middle" "might" "mile" "milk"
## [521] "million" "mind" "minister" "minus" "minute"
## [526] "miss" "mister" "moment" "monday" "money"
## [531] "month" "more" "morning" "most" "mother"
## [536] "motion" "move" "mrs" "much" "music"
## [541] "must" "name" "nation" "nature" "near"
## [546] "necessary" "need" "never" "new" "news"
## [551] "next" "nice" "night" "nine" "no"
## [556] "non" "none" "normal" "north" "not"
## [561] "note" "notice" "now" "number" "obvious"
## [566] "occasion" "odd" "of" "off" "offer"
## [571] "office" "often" "okay" "old" "on"
## [576] "once" "one" "only" "open" "operate"
## [581] "opportunity" "oppose" "or" "order" "organize"
## [586] "original" "other" "otherwise" "ought" "out"
## [591] "over" "own" "pack" "page" "paint"
## [596] "pair" "paper" "paragraph" "pardon" "parent"
## [601] "park" "part" "particular" "party" "pass"
## [606] "past" "pay" "pence" "pension" "people"
## [611] "per" "percent" "perfect" "perhaps" "period"
## [616] "person" "photograph" "pick" "picture" "piece"
## [621] "place" "plan" "play" "please" "plus"
## [626] "point" "police" "policy" "politic" "poor"
## [631] "position" "positive" "possible" "post" "pound"
## [636] "power" "practise" "prepare" "present" "press"
## [641] "pressure" "presume" "pretty" "previous" "price"
## [646] "print" "private" "probable" "problem" "proceed"
## [651] "process" "produce" "product" "programme" "project"
## [656] "proper" "propose" "protect" "provide" "public"
## [661] "pull" "purpose" "push" "put" "quality"
## [666] "quarter" "question" "quick" "quid" "quiet"
## [671] "quite" "radio" "rail" "raise" "range"
## [676] "rate" "rather" "read" "ready" "real"
## [681] "realise" "really" "reason" "receive" "recent"
## [686] "reckon" "recognize" "recommend" "record" "red"
## [691] "reduce" "refer" "regard" "region" "relation"
## [696] "remember" "report" "represent" "require" "research"
## [701] "resource" "respect" "responsible" "rest" "result"
## [706] "return" "rid" "right" "ring" "rise"
## [711] "road" "role" "roll" "room" "round"
## [716] "rule" "run" "safe" "sale" "same"
## [721] "saturday" "save" "say" "scheme" "school"
## [726] "science" "score" "scotland" "seat" "second"
## [731] "secretary" "section" "secure" "see" "seem"
## [736] "self" "sell" "send" "sense" "separate"
## [741] "serious" "serve" "service" "set" "settle"
## [746] "seven" "sex" "shall" "share" "she"
## [751] "sheet" "shoe" "shoot" "shop" "short"
## [756] "should" "show" "shut" "sick" "side"
## [761] "sign" "similar" "simple" "since" "sing"
## [766] "single" "sir" "sister" "sit" "site"
## [771] "situate" "six" "size" "sleep" "slight"
## [776] "slow" "small" "smoke" "so" "social"
## [781] "society" "some" "son" "soon" "sorry"
## [786] "sort" "sound" "south" "space" "speak"
## [791] "special" "specific" "speed" "spell" "spend"
## [796] "square" "staff" "stage" "stairs" "stand"
## [801] "standard" "start" "state" "station" "stay"
## [806] "step" "stick" "still" "stop" "story"
## [811] "straight" "strategy" "street" "strike" "strong"
## [816] "structure" "student" "study" "stuff" "stupid"
## [821] "subject" "succeed" "such" "sudden" "suggest"
## [826] "suit" "summer" "sun" "sunday" "supply"
## [831] "support" "suppose" "sure" "surprise" "switch"
## [836] "system" "table" "take" "talk" "tape"
## [841] "tax" "tea" "teach" "team" "telephone"
## [846] "television" "tell" "ten" "tend" "term"
## [851] "terrible" "test" "than" "thank" "the"
## [856] "then" "there" "therefore" "they" "thing"
## [861] "think" "thirteen" "thirty" "this" "thou"
## [866] "though" "thousand" "three" "through" "throw"
## [871] "thursday" "tie" "time" "to" "today"
## [876] "together" "tomorrow" "tonight" "too" "top"
## [881] "total" "touch" "toward" "town" "trade"
## [886] "traffic" "train" "transport" "travel" "treat"
## [891] "tree" "trouble" "true" "trust" "try"
## [896] "tuesday" "turn" "twelve" "twenty" "two"
## [901] "type" "under" "understand" "union" "unit"
## [906] "unite" "university" "unless" "until" "up"
## [911] "upon" "use" "usual" "value" "various"
## [916] "very" "video" "view" "village" "visit"
## [921] "vote" "wage" "wait" "walk" "wall"
## [926] "want" "war" "warm" "wash" "waste"
## [931] "watch" "water" "way" "we" "wear"
## [936] "wednesday" "wee" "week" "weigh" "welcome"
## [941] "well" "west" "what" "when" "where"
## [946] "whether" "which" "while" "white" "who"
## [951] "whole" "why" "wide" "wife" "will"
## [956] "win" "wind" "window" "wish" "with"
## [961] "within" "without" "woman" "wonder" "wood"
## [966] "word" "work" "world" "worry" "worse"
## [971] "worth" "would" "write" "wrong" "year"
## [976] "yes" "yesterday" "yet" "you" "young"
We can count how many common words contain the letter "e".
sum(str_detect(words,"e"))
## [1] 536
Let's get more complex: what proportion of words end in a vowel, and what proportion start with one?
mean(str_detect(words,"[aeiou]$"))
## [1] 0.2765306
mean(str_detect(words,"^[aeiou]"))
## [1] 0.1785714
Let's find all the words that don't contain "o" or "u".
no_o<-!str_detect(words, "[ou]")
Here is the resulting logical vector; TRUE means the word contains neither "o" nor "u".
no_o
## [1] TRUE TRUE FALSE FALSE TRUE FALSE TRUE FALSE TRUE TRUE FALSE TRUE
## [13] TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE
## [25] TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE TRUE TRUE
## [37] FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE
## [49] TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE
## [61] FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [73] TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE TRUE
## [85] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [133] TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [145] TRUE TRUE TRUE FALSE FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE
## [157] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [205] FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [217] TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## [229] TRUE FALSE TRUE TRUE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE TRUE TRUE TRUE TRUE FALSE TRUE FALSE FALSE TRUE
## [253] TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## [265] TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE FALSE TRUE FALSE FALSE
## [277] TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## [289] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [301] TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE FALSE
## [313] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [325] TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [337] FALSE FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [349] FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## [361] FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
## [373] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [385] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## [397] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [409] FALSE FALSE FALSE TRUE TRUE TRUE TRUE FALSE FALSE TRUE FALSE FALSE
## [421] TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE TRUE FALSE FALSE TRUE
## [433] FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE
## [445] TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE TRUE FALSE
## [457] TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [469] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [481] TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [493] FALSE FALSE FALSE FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
## [505] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE
## [517] TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE FALSE TRUE TRUE FALSE
## [529] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [541] FALSE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [553] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [565] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [577] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [589] FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## [601] TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE
## [613] TRUE TRUE FALSE FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
## [625] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [637] TRUE TRUE TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE TRUE FALSE
## [649] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [661] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [673] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## [685] TRUE FALSE FALSE FALSE FALSE TRUE FALSE TRUE TRUE FALSE FALSE TRUE
## [697] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE FALSE TRUE TRUE
## [709] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE
## [721] FALSE TRUE TRUE TRUE FALSE TRUE FALSE FALSE TRUE FALSE TRUE FALSE
## [733] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
## [745] TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
## [757] FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [769] TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE FALSE FALSE FALSE
## [781] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE
## [793] TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## [805] TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE FALSE FALSE
## [817] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [829] FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## [841] TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## [853] TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## [865] FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE
## [877] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE
## [889] TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE TRUE TRUE FALSE
## [901] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [913] FALSE FALSE FALSE TRUE FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
## [925] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [937] TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [949] TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE
## [961] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [973] TRUE FALSE TRUE TRUE TRUE TRUE FALSE FALSE
Now let's extract the words that do not contain an "o" or a "u".
words[!str_detect(words, "[ou]")]
## [1] "a" "able" "accept" "achieve" "act"
## [6] "active" "add" "address" "admit" "advertise"
## [11] "affect" "after" "again" "against" "age"
## [16] "agent" "agree" "air" "all" "already"
## [21] "alright" "always" "america" "and" "answer"
## [26] "any" "apart" "apparent" "appear" "apply"
## [31] "area" "arm" "arrange" "art" "as"
## [36] "ask" "at" "attend" "available" "aware"
## [41] "away" "baby" "back" "bad" "bag"
## [46] "balance" "ball" "bank" "bar" "base"
## [51] "basis" "be" "bear" "beat" "bed"
## [56] "begin" "behind" "believe" "benefit" "best"
## [61] "bet" "between" "big" "bill" "birth"
## [66] "bit" "black" "break" "brief" "brilliant"
## [71] "bring" "britain" "by" "cake" "call"
## [76] "can" "car" "card" "care" "carry"
## [81] "case" "cat" "catch" "cent" "centre"
## [86] "certain" "chair" "chairman" "chance" "change"
## [91] "chap" "character" "charge" "cheap" "check"
## [96] "child" "Christ" "Christmas" "city" "claim"
## [101] "class" "clean" "clear" "client" "create"
## [106] "dad" "danger" "date" "day" "dead"
## [111] "deal" "dear" "debate" "decide" "deep"
## [116] "definite" "degree" "department" "depend" "describe"
## [121] "design" "detail" "die" "difference" "dinner"
## [126] "direct" "district" "divide" "draw" "dress"
## [131] "drink" "drive" "dry" "each" "early"
## [136] "east" "easy" "eat" "effect" "egg"
## [141] "eight" "either" "elect" "electric" "eleven"
## [146] "else" "end" "engine" "english" "enter"
## [151] "especial" "even" "evening" "ever" "every"
## [156] "evidence" "exact" "example" "except" "exercise"
## [161] "exist" "expect" "expense" "experience" "explain"
## [166] "express" "extra" "eye" "face" "fact"
## [171] "fair" "fall" "family" "far" "farm"
## [176] "fast" "father" "feed" "feel" "few"
## [181] "field" "fight" "file" "fill" "film"
## [186] "final" "finance" "find" "fine" "finish"
## [191] "fire" "first" "fish" "fit" "five"
## [196] "flat" "fly" "france" "free" "friday"
## [201] "friend" "game" "garden" "gas" "general"
## [206] "germany" "get" "girl" "give" "glass"
## [211] "grand" "grant" "great" "green" "hair"
## [216] "half" "hall" "hand" "hang" "happen"
## [221] "happy" "hard" "hate" "have" "he"
## [226] "head" "health" "hear" "heart" "heat"
## [231] "heavy" "hell" "help" "here" "high"
## [236] "hit" "idea" "identify" "if" "imagine"
## [241] "in" "increase" "indeed" "inside" "instead"
## [246] "interest" "invest" "it" "item" "keep"
## [251] "key" "kid" "kill" "kind" "king"
## [256] "kitchen" "lad" "lady" "land" "large"
## [261] "last" "late" "law" "lay" "lead"
## [266] "learn" "leave" "left" "leg" "less"
## [271] "let" "letter" "level" "lie" "life"
## [276] "light" "like" "likely" "limit" "line"
## [281] "link" "list" "listen" "little" "live"
## [286] "machine" "main" "make" "man" "manage"
## [291] "many" "mark" "market" "marry" "match"
## [296] "matter" "may" "maybe" "mean" "meaning"
## [301] "meet" "member" "middle" "might" "mile"
## [306] "milk" "mind" "minister" "miss" "mister"
## [311] "mrs" "name" "near" "necessary" "need"
## [316] "never" "new" "news" "next" "nice"
## [321] "night" "nine" "pack" "page" "paint"
## [326] "pair" "paper" "paragraph" "parent" "park"
## [331] "part" "party" "pass" "past" "pay"
## [336] "pence" "per" "percent" "perfect" "perhaps"
## [341] "pick" "piece" "place" "plan" "play"
## [346] "please" "practise" "prepare" "present" "press"
## [351] "pretty" "price" "print" "private" "rail"
## [356] "raise" "range" "rate" "rather" "read"
## [361] "ready" "real" "realise" "really" "receive"
## [366] "recent" "red" "refer" "regard" "remember"
## [371] "represent" "research" "respect" "rest" "rid"
## [376] "right" "ring" "rise" "safe" "sale"
## [381] "same" "save" "say" "scheme" "science"
## [386] "seat" "secretary" "see" "seem" "self"
## [391] "sell" "send" "sense" "separate" "serve"
## [396] "service" "set" "settle" "seven" "sex"
## [401] "shall" "share" "she" "sheet" "sick"
## [406] "side" "sign" "similar" "simple" "since"
## [411] "sing" "single" "sir" "sister" "sit"
## [416] "site" "six" "size" "sleep" "slight"
## [421] "small" "space" "speak" "special" "specific"
## [426] "speed" "spell" "spend" "staff" "stage"
## [431] "stairs" "stand" "standard" "start" "state"
## [436] "stay" "step" "stick" "still" "straight"
## [441] "strategy" "street" "strike" "switch" "system"
## [446] "table" "take" "talk" "tape" "tax"
## [451] "tea" "teach" "team" "tell" "ten"
## [456] "tend" "term" "terrible" "test" "than"
## [461] "thank" "the" "then" "there" "they"
## [466] "thing" "think" "thirteen" "thirty" "this"
## [471] "three" "tie" "time" "trade" "traffic"
## [476] "train" "travel" "treat" "tree" "try"
## [481] "twelve" "twenty" "type" "very" "view"
## [486] "village" "visit" "wage" "wait" "walk"
## [491] "wall" "want" "war" "warm" "wash"
## [496] "waste" "watch" "water" "way" "we"
## [501] "wear" "wednesday" "wee" "week" "weigh"
## [506] "well" "west" "what" "when" "where"
## [511] "whether" "which" "while" "white" "why"
## [516] "wide" "wife" "will" "win" "wind"
## [521] "wish" "with" "within" "write" "year"
## [526] "yes" "yesterday" "yet"
You can also use str_count() to count how many matches there are in a string.
x
## [1] "attaga" "cgccccggat" "tatta"
str_count(x, "[gc]")
## [1] 1 8 0
Let's couple this with mutate().
df<-tibble(
word=words,
count=seq_along(word)
)
This is what it looks like now.
df
## # A tibble: 980 × 2
## word count
## <chr> <int>
## 1 a 1
## 2 able 2
## 3 about 3
## 4 absolute 4
## 5 accept 5
## 6 account 6
## 7 achieve 7
## 8 across 8
## 9 act 9
## 10 active 10
## # … with 970 more rows
We can mutate this to count how many vowels and consonants each word in our data frame has.
df%>%
mutate(
vowels=str_count(word,"[aeiou]"),
consonants=str_count(word,"[^aeiou]"))
## # A tibble: 980 × 4
## word count vowels consonants
## <chr> <int> <int> <int>
## 1 a 1 1 0
## 2 able 2 2 2
## 3 about 3 3 2
## 4 absolute 4 4 4
## 5 accept 5 2 4
## 6 account 6 3 4
## 7 achieve 7 4 3
## 8 across 8 2 4
## 9 act 9 1 2
## 10 active 10 3 3
## # … with 970 more rows
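As a quick illustration (my own example, not part of the original run), the same counts can feed straight into a filter, for instance keeping only the words with more vowels than consonants:
df%>%
mutate(
vowels=str_count(word,"[aeiou]"),
consonants=str_count(word,"[^aeiou]"))%>%
filter(vowels>consonants)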
Here is how to point R at a working directory (one Windows example and one Linux example):
setwd("C:/Users/THE_G/OneDrive/Desktop/BISC_450c/Bric16")
setwd("~/Desktop/classroom/myfiles")
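Before reading any files, a quick sanity check (a suggestion, not in the original run) confirms where R is looking and whether the expected files are visible:
getwd() # print the current working directory
list.files(pattern="\\.CEL$") # confirm the CEL files can be seen from here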
First, let us load all of the required packages:
library(ath1121501.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:tidyr':
##
## expand
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:purrr':
##
## reduce
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: org.At.tair.db
##
##
library(ath1121501cdf)
## Warning: replacing previous import 'AnnotationDbi::tail' by 'utils::tail' when
## loading 'ath1121501cdf'
## Warning: replacing previous import 'AnnotationDbi::head' by 'utils::head' when
## loading 'ath1121501cdf'
##
library(annotate)
## Loading required package: XML
BiocManager::install("affy")
## 'getOption("repos")' replaces Bioconductor standard repositories, see
## '?repositories' for details
##
## replacement repositories:
## CRAN: https://cloud.r-project.org
## Bioconductor version 3.14 (BiocManager 1.30.16), R 4.1.3 (2022-03-10)
## Warning: package(s) not installed when version(s) same as current; use `force = TRUE` to
## re-install: 'affy'
## Installation paths not writeable, unable to update packages
## path: /usr/lib/R/library
## packages:
## cluster, MASS, Matrix, mgcv, nlme, spatial, survival
## Old packages: 'ade4', 'adegenet', 'aplot', 'bbmle', 'BiocManager', 'blob',
## 'broom', 'checkmate', 'circlize', 'cli', 'dichromat', 'dplyr', 'DT', 'ff',
## 'formatR', 'ggfun', 'ggplot2', 'googleVis', 'gplots', 'Gviz', 'haven',
## 'Hmisc', 'httr', 'hwriter', 'igraph', 'interp', 'kernlab', 'knitr', 'limma',
## 'magrittr', 'matrixStats', 'mclust', 'minpack.lm', 'openssl', 'phytools',
## 'ps', 'RColorBrewer', 'RcppArmadillo', 'RcppEigen', 'reshape', 'rgl',
## 'rmarkdown', 'RNeXML', 'RSQLite', 'scales', 'segmented', 'seqinr', 'shinyBS',
## 'sp', 'systemPipeR', 'tibble', 'tinytex', 'uuid', 'vctrs', 'vegan', 'xfun',
## 'zoo'
library(affy)
library(affyio)
library(limma)
##
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
library(oligo)
## Loading required package: oligoClasses
## Welcome to oligoClasses version 1.56.0
##
## Attaching package: 'oligoClasses'
## The following object is masked from 'package:affy':
##
## list.celfiles
## Loading required package: Biostrings
## Loading required package: XVector
##
## Attaching package: 'XVector'
## The following object is masked from 'package:purrr':
##
## compact
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
## ================================================================================
## Welcome to oligo version 1.58.0
## ================================================================================
##
## Attaching package: 'oligo'
## The following object is masked from 'package:limma':
##
## backgroundCorrect
## The following objects are masked from 'package:affy':
##
## intensity, MAplot, mm, mm<-, mmindex, pm, pm<-, pmindex,
## probeNames, rma
## The following object is masked from 'package:dplyr':
##
## summarize
library(dplyr)
library(stats)
library(reshape)
##
## Attaching package: 'reshape'
## The following objects are masked from 'package:S4Vectors':
##
## expand, rename
## The following object is masked from 'package:dplyr':
##
## rename
## The following objects are masked from 'package:tidyr':
##
## expand, smiths
Read the targets file and the CEL files from the working directory.
targets<- readTargets("Bric16_Targets.csv", sep=",", row.names="filename")
ab<-ReadAffy()
We can make a histogram from the data.
hist(ab)
Normalizing the microarray data with RMA.
eset<-affy::rma(ab)
## Background correcting
## Normalizing
## Calculating Expression
pData(eset)
## sample
## Atha_Ler-0_sShoots_FLT_Rep1_F-F2_raw.CEL 1
## Atha_Ler-0_sShoots_FLT_Rep2_F-F3_raw.CEL 2
## Atha_Ler-0_sShoots_FLT_Rep3_F-F4_raw.CEL 3
## Atha_Ler-0_sShoots_GC_Rep1_H-F2_raw.CEL 4
## Atha_Ler-0_sShoots_GC_Rep2_H-F3_raw.CEL 5
## Atha_Ler-0_sShoots_GC_Rep3_H-F4_raw.CEL 6
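As an optional check (not part of the original run), Biobase's exprs() exposes the normalized probes-by-samples matrix behind eset:
exprs_mat<-exprs(eset) # normalized log2 expression values
dim(exprs_mat) # number of probes and samples
head(exprs_mat[,1:3]) # peek at the first three arrays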
Let's visualize the normalized data in a histogram.
hist(eset)
Let's characterize the data a bit by mapping the probe IDs to gene symbols.
ID<- featureNames(eset)
Symbol<-getSYMBOL(ID, "ath1121501.db")
head(Symbol)
## 244901_at 244902_at 244903_at 244904_at 244905_at 244906_at
## "ORF25" "NAD4L" "ORF149" "ORF275" "ORF122C" "ORF240A"
Now we have to make a variable with the file of choice.
affydata<-read.delim("affy_ATH1_array_elements.txt")
Setting up the design matrix for the flight vs. ground comparison:
condition<-targets$gravity
design<-model.matrix(~factor(condition))
colnames(design)<-c("flight","ground")
Then we fit the linear model, compute the moderated statistics, and extract the full ranked table.
fit<-lmFit(eset,design)
fit<-eBayes(fit)
options(digits=2)
top<-topTable(fit,coef=2,n=Inf,adjust='fdr')
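As a quick look (my own addition, not in the original run), we can peek at the strongest hits and count how many probes pass a 5% FDR cutoff:
head(top) # probes ranked by evidence of differential expression
sum(top$adj.P.Val<0.05) # probes passing a 5% FDR threshold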
This is how we can combine different annotations.
annot<-data.frame(GENE=sapply(contents(ath1121501GENENAME), paste,collapse=", "),
KEGG=sapply(contents(ath1121501PATH), paste, collapse=", "),
GO=sapply(contents(ath1121501GO), paste, collapse=", "),
SYMBOL=sapply(contents(ath1121501SYMBOL), paste,collapse=", "),
ACCNUM=sapply(contents(ath1121501ACCNUM), paste, collapse=", "))
Let's merge all the data into one data frame.
all<-merge(annot, top, by.x=0, by.y=0, all=TRUE)
all2<-merge(all,affydata, by.x="Row.names", by.y="array_element_name")
Let's convert the ACCNUM column to character.
all2$ACCNUM<- as.character(all2$ACCNUM)
Then we write the merged table out, keeping both the row and column names.
write.table(all2, file="BRIC_16_Final.csv", row.names=TRUE, col.names=TRUE, sep="\t")
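Note that the call above writes tab-separated values into a .csv-named file; if comma-separated output is preferred, a write.csv() call (a suggestion with a hypothetical filename, not part of the original run) keeps the name and the format consistent:
write.csv(all2, file="BRIC_16_Final_commas.csv", row.names=TRUE)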
columns(org.At.tair.db)
## [1] "ARACYC" "ARACYCENZYME" "ENTREZID" "ENZYME" "EVIDENCE"
## [6] "EVIDENCEALL" "GENENAME" "GO" "GOALL" "ONTOLOGY"
## [11] "ONTOLOGYALL" "PATH" "PMID" "REFSEQ" "SYMBOL"
## [16] "TAIR"
We can store the fold changes in a variable and customize the data table by adding Entrez IDs.
foldchanges<-as.data.frame(all2$logFC)
all2$entrez=mapIds(org.At.tair.db,
keys=all2$ACCNUM,
column="ENTREZID",
keytype="TAIR",
multiVals="first")
## 'select()' returned 1:1 mapping between keys and columns
This is what the head of the data frame looks like.
head(all2,10)
## Row.names
## 1 244901_at
## 2 244902_at
## 3 244903_at
## 4 244904_at
## 5 244905_at
## 6 244906_at
## 7 244907_at
## 8 244908_at
## 9 244909_at
## 10 244910_s_at
## GENE
## 1 encodes a plant b subunit of mitochondrial ATP synthase based on structural similarity and the presence in the F(0) complex.
## 2 Encodes NADH dehydrogenase subunit 4L.
## 3 hypothetical protein
## 4 hypothetical protein
## 5 hypothetical protein
## 6 hypothetical protein
## 7 hypothetical protein
## 8 hypothetical protein
## 9 hypothetical protein
## 10 hypothetical protein
## KEGG
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
## 7 NA
## 8 NA
## 9 NA
## 10 NA
## GO
## 1 list(GOID = "GO:0015986", Evidence = "TAS", Ontology = "BP"), list(GOID = "GO:0015986", Evidence = "IEA", Ontology = "BP"), list(GOID = "GO:0015986", Evidence = "TAS", Ontology = "BP"), list(GOID = "GO:0015986", Evidence = "IEA", Ontology = "BP"), list(GOID = "GO:0015986", Evidence = "TAS", Ontology = "BP"), list(GOID = "GO:0015986", Evidence = "TAS", Ontology = "BP"), list(GOID = "GO:0015986", Evidence = "IEA", Ontology = "BP"), list(GOID = "GO:0015986", Evidence = "IEA", Ontology = "BP"), list(GOID = "GO:0015986", Evidence = "IEA", Ontology = "BP"), list(GOID = "GO:0015986", Evidence = "TAS", Ontology = "BP"), list(GOID = "GO:0000276", Evidence = "IDA", Ontology = "CC"), list(GOID = "GO:0000276", Evidence = "IBA", Ontology = "CC"), list(GOID = "GO:0000276", Evidence = "IBA", Ontology = "CC"), list(GOID = "GO:0000276", Evidence = "IBA", Ontology = "CC"), list(GOID = "GO:0000276", Evidence = "IDA", Ontology = "CC"), list(GOID = "GO:0000276", Evidence = "IDA", Ontology = "CC"), list(GOID = "GO:0005739", Evidence = "HDA", Ontology = "CC"), list(GOID = "GO:0005739", Evidence = "HDA", Ontology = "CC"), list(GOID = "GO:0005739", Evidence = "HDA", Ontology = "CC"), list(GOID = "GO:0005739", Evidence = "ISM", Ontology = "CC"), list(GOID = "GO:0005753", Evidence = "HDA", Ontology = "CC"), list(GOID = "GO:0005753", Evidence = "HDA", Ontology = "CC"), list(GOID = "GO:0005753", Evidence = "HDA", Ontology = "CC"), list(GOID = "GO:0008270", Evidence = "HDA", Ontology = "MF"), list(GOID = "GO:0015078", Evidence = "IEA", Ontology = "MF"), list(GOID = "GO:0046933", Evidence = "TAS", Ontology = "MF"), list(GOID = "GO:0046933", Evidence = "TAS", Ontology = "MF")
## 2 list(GOID = "GO:0042773", Evidence = "IEA", Ontology = "BP"), list(GOID = "GO:0042773", Evidence = "IEA", Ontology = "BP"), list(GOID = "GO:0042773", Evidence = "IEA", Ontology = "BP"), list(GOID = "GO:0045333", Evidence = "TAS", Ontology = "BP"), list(GOID = "GO:0005739", Evidence = "TAS", Ontology = "CC"), list(GOID = "GO:0005739", Evidence = "ISM", Ontology = "CC"), list(GOID = "GO:0030964", Evidence = "IBA", Ontology = "CC"), list(GOID = "GO:0045271", Evidence = "TAS", Ontology = "CC"), list(GOID = "GO:0045272", Evidence = "IBA", Ontology = "CC"), list(GOID = "GO:0045272", Evidence = "IBA", Ontology = "CC"), list(GOID = "GO:0003954", Evidence = "TAS", Ontology = "MF"), list(GOID = "GO:0008137", Evidence = "IBA", Ontology = "MF")
## 3 list(GOID = "GO:0008150", Evidence = "ND", Ontology = "BP"), list(GOID = "GO:0005739", Evidence = "ISM", Ontology = "CC"), list(GOID = "GO:0003674", Evidence = "ND", Ontology = "MF")
## 4 list(GOID = "GO:0008150", Evidence = "ND", Ontology = "BP"), list(GOID = "GO:0005739", Evidence = "ISM", Ontology = "CC"), list(GOID = "GO:0003674", Evidence = "ND", Ontology = "MF")
## 5 list(GOID = "GO:0008150", Evidence = "ND", Ontology = "BP"), list(GOID = "GO:0005739", Evidence = "ISM", Ontology = "CC"), list(GOID = "GO:0003674", Evidence = "ND", Ontology = "MF")
## 6 list(GOID = "GO:0005739", Evidence = "ISM", Ontology = "CC")
## 7 list(GOID = "GO:0008150", Evidence = "ND", Ontology = "BP"), list(GOID = "GO:0005739", Evidence = "ISM", Ontology = "CC")
## 8 list(GOID = "GO:0008150", Evidence = "ND", Ontology = "BP"), list(GOID = "GO:0005739", Evidence = "ISM", Ontology = "CC"), list(GOID = "GO:0003674", Evidence = "ND", Ontology = "MF")
## 9 list(GOID = "GO:0008150", Evidence = "ND", Ontology = "BP"), list(GOID = "GO:0005739", Evidence = "ISM", Ontology = "CC"), list(GOID = "GO:0003674", Evidence = "ND", Ontology = "MF")
## 10 list(GOID = "GO:0008150", Evidence = "ND", Ontology = "BP"), list(GOID = "GO:0005739", Evidence = "ISM", Ontology = "CC"), list(GOID = "GO:0003674", Evidence = "ND", Ontology = "MF")
## SYMBOL ACCNUM logFC AveExpr t P.Value adj.P.Val B
## 1 ORF25 ATMG00640 -0.742 8.5 -3.74 0.0069 0.081 -2.5
## 2 NAD4L ATMG00650 -0.826 7.5 -3.55 0.0090 0.094 -2.7
## 3 ORF149 ATMG00660 -1.214 7.8 -6.20 0.0004 0.018 0.5
## 4 ORF275 ATMG00670 -0.568 3.2 -2.07 0.0758 0.300 -4.9
## 5 ORF122C ATMG00680 0.120 1.4 0.60 0.5661 0.794 -6.6
## 6 ORF240A ATMG00690 0.055 9.1 0.18 0.8586 0.947 -6.8
## 7 ORF120 ATMG00710 -0.883 4.3 -2.91 0.0220 0.154 -3.7
## 8 ORF107D ATMG00720 -0.463 2.2 -1.73 0.1262 0.396 -5.4
## 9 ORF100A ATMG00740 -0.510 1.5 -3.18 0.0150 0.125 -3.3
## 10 ORF119 ATMG00750 -0.694 1.9 -2.09 0.0744 0.297 -4.9
## array_element_type organism is_control locus
## 1 oligonucleotide Arabidopsis thaliana no ATMG00640
## 2 oligonucleotide Arabidopsis thaliana no ATMG00650
## 3 oligonucleotide Arabidopsis thaliana no ATMG00660
## 4 oligonucleotide Arabidopsis thaliana no ATMG00670
## 5 oligonucleotide Arabidopsis thaliana no ATMG00680
## 6 oligonucleotide Arabidopsis thaliana no ATMG00690
## 7 oligonucleotide Arabidopsis thaliana no ATMG00710
## 8 oligonucleotide Arabidopsis thaliana no ATMG00720
## 9 oligonucleotide Arabidopsis thaliana no ATMG00740
## 10 oligonucleotide Arabidopsis thaliana no ATMG00750;AT2G07686
## description
## 1 hydrogen ion transporting ATP synthases, rotational mechanism;zinc ion binding
## 2 NADH dehydrogenase subunit 4L
## 3 hypothetical protein
## 4 hypothetical protein
## 5 hypothetical protein
## 6 hypothetical protein
## 7 Polynucleotidyl transferase, ribonuclease H-like superfamily protein
## 8 hypothetical protein
## 9 hypothetical protein
## 10 [ATMG00750, GAG/POL/ENV polyprotein];[AT2G07686, transposable element gene]
## chromosome start
## 1 M 188160
## 2 M 188954
## 3 M 190106
## 4 M 191055
## 5 M 201768
## 6 M 203634
## 7 M 207571
## 8 M 209500
## 9 M 220521
## 10 [ATMG00750, M];[AT2G07686, 2] [ATMG00750, 220851];[AT2G07686, 3309747]
## stop entrez
## 1 188619 NA
## 2 189182 NA
## 3 190540 NA
## 4 191627 NA
## 5 202096 NA
## 6 204043 NA
## 7 207882 NA
## 8 209821 NA
## 9 220769 NA
## 10 [ATMG00750, 221184];[AT2G07686, 3310080] NA
We have to load the required packages first.
library(pathview)
##
## ##############################################################################
## Pathview is an open source software package distributed under GNU General
## Public License version 3 (GPLv3). Details of GPLv3 is available at
## http://www.gnu.org/licenses/gpl-3.0.html. Particullary, users are required to
## formally cite the original Pathview paper (not just mention it) in publications
## or products. For details, do citation("pathview") within R.
##
## The pathview downloads and uses KEGG data. Non-academic uses may require a KEGG
## license agreement (details at http://www.kegg.jp/kegg/legal.html).
## ##############################################################################
library(gage)
##
library(gageData)
Now we can load the KEGG gene-set data (note that kegg.sets.hs is the human collection; the Arabidopsis sets are built below with kegg.gsets()) and name our fold changes by Entrez ID.
data(kegg.sets.hs)
foldchanges=all2$logFC
names(foldchanges)=all2$entrez
Here is what that looks like; the names are <NA> because these mitochondrial loci have no Entrez ID.
head(foldchanges)
## <NA> <NA> <NA> <NA> <NA> <NA>
## -0.742 -0.826 -1.214 -0.568 0.120 0.055
Building the KEGG gene sets for Arabidopsis, keyed by Entrez ID.
kegg.ath=kegg.gsets("ath", id.type="entrez")
kegg.ath.sigmet=kegg.ath$kg.sets[kegg.ath$sigmet.idx]
Let's get the results.
keggres=gage(foldchanges, gsets=kegg.ath.sigmet,same.dir=TRUE)
We can also lapply() head() over the results to inspect each component.
lapply(keggres,head)
## $greater
## p.geomean stat.mean p.val q.val
## ath03010 Ribosome 2.4e-27 11.5 2.4e-27 2.7e-25
## ath00195 Photosynthesis 7.6e-06 4.6 7.6e-06 4.3e-04
## ath04145 Phagosome 3.2e-05 4.1 3.2e-05 1.2e-03
## ath01230 Biosynthesis of amino acids 5.5e-05 3.9 5.5e-05 1.5e-03
## ath00020 Citrate cycle (TCA cycle) 1.1e-03 3.1 1.1e-03 2.5e-02
## ath00190 Oxidative phosphorylation 2.2e-03 2.9 2.2e-03 3.4e-02
## set.size exp1
## ath03010 Ribosome 261 2.4e-27
## ath00195 Photosynthesis 44 7.6e-06
## ath04145 Phagosome 71 3.2e-05
## ath01230 Biosynthesis of amino acids 201 5.5e-05
## ath00020 Citrate cycle (TCA cycle) 58 1.1e-03
## ath00190 Oxidative phosphorylation 103 2.2e-03
##
## $less
## p.geomean stat.mean p.val
## ath04016 MAPK signaling pathway - plant 0.0035 -2.7 0.0035
## ath00906 Carotenoid biosynthesis 0.0198 -2.1 0.0198
## ath04141 Protein processing in endoplasmic reticulum 0.0283 -1.9 0.0283
## ath00592 alpha-Linolenic acid metabolism 0.0552 -1.6 0.0552
## ath00071 Fatty acid degradation 0.1160 -1.2 0.1160
## ath04122 Sulfur relay system 0.1415 -1.1 0.1415
## q.val set.size exp1
## ath04016 MAPK signaling pathway - plant 0.4 129 0.0035
## ath00906 Carotenoid biosynthesis 1.0 28 0.0198
## ath04141 Protein processing in endoplasmic reticulum 1.0 189 0.0283
## ath00592 alpha-Linolenic acid metabolism 1.0 36 0.0552
## ath00071 Fatty acid degradation 1.0 45 0.1160
## ath04122 Sulfur relay system 1.0 11 0.1415
##
## $stats
## stat.mean exp1
## ath03010 Ribosome 11.5 11.5
## ath00195 Photosynthesis 4.6 4.6
## ath04145 Phagosome 4.1 4.1
## ath01230 Biosynthesis of amino acids 3.9 3.9
## ath00020 Citrate cycle (TCA cycle) 3.1 3.1
## ath00190 Oxidative phosphorylation 2.9 2.9
Let's split our results into the upregulated (greater) and downregulated (less) sets.
greater<-keggres$greater
lessers<-keggres$less
And now we can write both back to our directory.
write.csv(greater, file="Bric_16_Greater_Pathways.csv")
write.csv(lessers, file="Bric_16_Lesser_Pathways.csv")
Let's map the pathways (greater).
keggrespathways=data.frame(id=rownames(keggres$greater), keggres$greater)%>%
tbl_df()%>%
filter(row_number() <=5) %>%
.$id %>%
as.character()
## Warning: `tbl_df()` was deprecated in dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
keggresids_greater=substr(keggrespathways, start=1, stop=8)
This is what it looks like.
keggresids_greater
## [1] "ath03010" "ath00195" "ath04145" "ath01230" "ath00020"
Converting the fold changes to character with as.character() (note: genedata is not used below; the pathview calls are given foldchanges directly).
genedata<- as.character (all2$logFC)
Define a plotting function to apply, etc.
plot_pathway=function(pid) pathview(gene.data=foldchanges, pathway.id=pid, species="ath", new.signature=FALSE)
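For a single pathway the helper can be called directly, for example with the first ID extracted above (this writes a PNG to the working directory, just like the sapply() run below):
plot_pathway("ath03010")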
Plot multiple pathways; the plots are saved to disk and a throwaway object list is returned.
tmp=sapply(keggresids_greater, function(pid) pathview(gene.data=foldchanges, pathway.id=pid, species="ath"))
## Info: Getting gene ID data from KEGG...
## Info: Done with data retrieval!
## Info: Working in directory /home/student/Desktop/classroom/myfiles
## Info: Writing image file ath03010.pathview.png
## Info: Getting gene ID data from KEGG...
## Info: Done with data retrieval!
## Info: Working in directory /home/student/Desktop/classroom/myfiles
## Info: Writing image file ath00195.pathview.png
## Info: Getting gene ID data from KEGG...
## Info: Done with data retrieval!
## Info: Working in directory /home/student/Desktop/classroom/myfiles
## Info: Writing image file ath04145.pathview.png
## Info: Getting gene ID data from KEGG...
## Info: Done with data retrieval!
## Info: Working in directory /home/student/Desktop/classroom/myfiles
## Info: Writing image file ath01230.pathview.png
## Info: Getting gene ID data from KEGG...
## Info: Done with data retrieval!
## Info: Working in directory /home/student/Desktop/classroom/myfiles
## Info: Writing image file ath00020.pathview.png
Let's map the pathways (less). Note that the id column below is built from rownames(keggres$greater) rather than rownames(keggres$less), an apparent copy-paste slip, which is why the IDs printed further down match the greater set.
keggrespathways=data.frame(id=rownames(keggres$greater), keggres$less)%>%
tbl_df()%>%
filter(row_number() <=5) %>%
.$id %>%
as.character()
keggresids_less=substr(keggrespathways, start=1, stop=8)
This is what it looks like.
keggresids_less
## [1] "ath03010" "ath00195" "ath04145" "ath01230" "ath00020"
Using our lesser pathway and applying it to genedata.
genedata<- as.character (all2$logFC)
Define a plotting function to apply, etc.
plot_pathway=function(pid) pathview(gene.data=foldchanges, pathway.id=pid, species="ath", new.signature=FALSE)
Plot multiple pathways; the plots are saved to disk and a throwaway object list is returned.
tmp=sapply(keggresids_less, function(pid) pathview(gene.data=foldchanges, pathway.id=pid, species="ath"))
## Info: Getting gene ID data from KEGG...
## Info: Done with data retrieval!
## Info: Working in directory /home/student/Desktop/classroom/myfiles
## Info: Writing image file ath03010.pathview.png
## Info: Getting gene ID data from KEGG...
## Info: Done with data retrieval!
## Info: Working in directory /home/student/Desktop/classroom/myfiles
## Info: Writing image file ath00195.pathview.png
## Info: Getting gene ID data from KEGG...
## Info: Done with data retrieval!
## Info: Working in directory /home/student/Desktop/classroom/myfiles
## Info: Writing image file ath04145.pathview.png
## Info: Getting gene ID data from KEGG...
## Info: Done with data retrieval!
## Info: Working in directory /home/student/Desktop/classroom/myfiles
## Info: Writing image file ath01230.pathview.png
## Info: Getting gene ID data from KEGG...
## Info: Done with data retrieval!
## Info: Working in directory /home/student/Desktop/classroom/myfiles
## Info: Writing image file ath00020.pathview.png
First, we must load the following packages:
library("edgeR")
library("dplyr")
library("AnnotationDbi")
library("org.Mm.eg.db")
##
Now we read in the normalized counts and create a grouping factor for the DGEList.
rawdata=read.csv("GLDS-102_rna_seq_Normalized_Counts.csv", row.names="gene_id")
group<-factor(c('1','1','1','1','1','1','2','2','2','2','2','2'))
dgeGlm<-DGEList(counts=rawdata,group=group)
str(dgeGlm)
## Formal class 'DGEList' [package "edgeR"] with 1 slot
## ..@ .Data:List of 2
## .. ..$ : num [1:24035, 1:12] 2976.8 59.8 21.2 40.1 0 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:24035] "ENSMUSG00000000001" "ENSMUSG00000000028" "ENSMUSG00000000031" "ENSMUSG00000000037" ...
## .. .. .. ..$ : chr [1:12] "Mmus_C57.6J_KDN_FLT_Rep1_M23" "Mmus_C57.6J_KDN_FLT_Rep2_M24" "Mmus_C57.6J_KDN_FLT_Rep3_M25" "Mmus_C57.6J_KDN_FLT_Rep4_M26" ...
## .. ..$ :'data.frame': 12 obs. of 3 variables:
## .. .. ..$ group : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 2 2 2 2 ...
## .. .. ..$ lib.size : num [1:12] 40266365 40740336 37739541 39196969 36820645 ...
## .. .. ..$ norm.factors: num [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..$ names: chr [1:2] "counts" "samples"
str(group)
## Factor w/ 2 levels "1","2": 1 1 1 1 1 1 2 2 2 2 ...
keep<-rowSums(cpm(dgeGlm)>2)>=4
keep is a logical vector: TRUE marks genes with CPM above 2 in at least 4 samples, and those are the rows we retain.
dgeGlm<-dgeGlm[keep,]
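As a quick sanity check (not part of the original run), we can count how many genes survive the CPM filter:
table(keep) # FALSE = dropped, TRUE = retained
dim(dgeGlm) # genes and samples remaining after filtering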
Looking at the names of dgeGlm.
names(dgeGlm)
## [1] "counts" "samples"
Using dgeGlm[["samples"]] to pull out the sample table.
dgeGlm[["samples"]]
## group lib.size norm.factors
## Mmus_C57.6J_KDN_FLT_Rep1_M23 1 4.0e+07 1
## Mmus_C57.6J_KDN_FLT_Rep2_M24 1 4.1e+07 1
## Mmus_C57.6J_KDN_FLT_Rep3_M25 1 3.8e+07 1
## Mmus_C57.6J_KDN_FLT_Rep4_M26 1 3.9e+07 1
## Mmus_C57.6J_KDN_FLT_Rep5_M27 1 3.7e+07 1
## Mmus_C57.6J_KDN_FLT_Rep6_M28 1 3.6e+07 1
## Mmus_C57.6J_KDN_GC_Rep1_M33 2 3.7e+07 1
## Mmus_C57.6J_KDN_GC_Rep2_M34 2 3.7e+07 1
## Mmus_C57.6J_KDN_GC_Rep3_M35 2 4.0e+07 1
## Mmus_C57.6J_KDN_GC_Rep4_M36 2 3.6e+07 1
## Mmus_C57.6J_KDN_GC_Rep5_M37 2 3.8e+07 1
## Mmus_C57.6J_KDN_GC_Rep6_M38 2 3.5e+07 1
Making the design matrix.
design<-model.matrix(~group)
Here is what that looks like.
design
## (Intercept) group2
## 1 1 0
## 2 1 0
## 3 1 0
## 4 1 0
## 5 1 0
## 6 1 0
## 7 1 1
## 8 1 1
## 9 1 1
## 10 1 1
## 11 1 1
## 12 1 1
## attr(,"assign")
## [1] 0 1
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"
Now we estimate the dispersions, starting with estimateGLMCommonDisp().
dgeGlmComDisp<-estimateGLMCommonDisp(dgeGlm, design, verbose=TRUE)
## Disp = 0.026 , BCV = 0.16
dgeGlmTrendDisp<- estimateGLMTrendedDisp(dgeGlmComDisp, design)
dgeGlmTagDisp<- estimateGLMTagwiseDisp(dgeGlmTrendDisp, design)
To draw the BCV plot we would call plotBCV(dgeGlmTagDisp); typing plotBCV without parentheses, as below, just prints the function's source.
plotBCV
## function (y, xlab = "Average log CPM", ylab = "Biological coefficient of variation",
## pch = 16, cex = 0.2, col.common = "red", col.trend = "blue",
## col.tagwise = "black", ...)
## {
## if (!is(y, "DGEList"))
## stop("y must be a DGEList.")
## A <- y$AveLogCPM
## if (is.null(A))
## A <- aveLogCPM(y$counts, offset = getOffset(y))
## disp <- getDispersion(y)
## if (is.null(disp))
## stop("No dispersions to plot")
## if (attr(disp, "type") == "common")
## disp <- rep_len(disp, length(A))
## plot(A, sqrt(disp), xlab = xlab, ylab = ylab, type = "n",
## ...)
## labels <- cols <- lty <- pt <- NULL
## if (!is.null(y$tagwise.dispersion)) {
## points(A, sqrt(y$tagwise.dispersion), pch = pch, cex = cex,
## col = col.tagwise)
## labels <- c(labels, "Tagwise")
## cols <- c(cols, col.tagwise)
## lty <- c(lty, -1)
## pt <- c(pt, pch)
## }
## if (!is.null(y$common.dispersion)) {
## abline(h = sqrt(y$common.dispersion), col = col.common,
## lwd = 2)
## labels <- c(labels, "Common")
## cols <- c(cols, col.common)
## lty <- c(lty, 1)
## pt <- c(pt, -1)
## }
## if (!is.null(y$trended.dispersion)) {
## o <- order(A)
## lines(A[o], sqrt(y$trended.dispersion)[o], col = col.trend,
## lwd = 2)
## labels <- c(labels, "Trend")
## cols <- c(cols, col.trend)
## lty <- c(lty, 1)
## pt <- c(pt, -1)
## }
## legend("topright", legend = labels, lty = lty, pch = pt,
## pt.cex = cex, lwd = 2, col = cols)
## invisible()
## }
## <bytecode: 0x561101f34158>
## <environment: namespace:edgeR>
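To actually draw the plot, the intended call (with the DGEList as the argument) would be:
plotBCV(dgeGlmTagDisp) # tagwise points, common-dispersion line, trended curve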
Fitting the GLM and testing the group coefficient.
fit<-glmFit(dgeGlmTagDisp, design)
colnames(coef(fit))
## [1] "(Intercept)" "group2"
lrt<-glmLRT(fit, coef=2)
topTags(lrt)
## Coefficient: group2
## logFC logCPM LR PValue FDR
## ENSMUSG00000026077 -1.36 3.6 80 4.3e-19 5.9e-15
## ENSMUSG00000007872 0.89 5.5 77 1.9e-18 1.3e-14
## ENSMUSG00000021775 0.95 6.2 63 2.0e-15 9.1e-12
## ENSMUSG00000002250 -0.83 5.3 62 2.7e-15 9.2e-12
## ENSMUSG00000059824 2.26 4.6 58 2.6e-14 7.2e-11
## ENSMUSG00000074715 -1.99 3.8 47 7.0e-12 1.6e-08
## ENSMUSG00000027875 1.72 4.0 38 6.2e-10 8.6e-07
## ENSMUSG00000038393 0.77 9.4 38 6.3e-10 8.6e-07
## ENSMUSG00000020889 0.79 6.3 38 6.3e-10 8.6e-07
## ENSMUSG00000038224 -0.69 5.2 38 6.3e-10 8.6e-07
ttGlm<-topTags(lrt, n=Inf)
This is what the data now displays.
head(ttGlm)
## Coefficient: group2
## logFC logCPM LR PValue FDR
## ENSMUSG00000026077 -1.36 3.6 80 4.3e-19 5.9e-15
## ENSMUSG00000007872 0.89 5.5 77 1.9e-18 1.3e-14
## ENSMUSG00000021775 0.95 6.2 63 2.0e-15 9.1e-12
## ENSMUSG00000002250 -0.83 5.3 62 2.7e-15 9.2e-12
## ENSMUSG00000059824 2.26 4.6 58 2.6e-14 7.2e-11
## ENSMUSG00000074715 -1.99 3.8 47 7.0e-12 1.6e-08
This is the class of ttGlm.
class(ttGlm)
## [1] "TopTags"
## attr(,"package")
## [1] "edgeR"
Summarizing the test results and flagging the significant genes.
summary(deGlm<- decideTestsDGE(lrt, p=0.05, adjust="fdr"))
## group2
## Down 39
## NotSig 13478
## Up 96
tagsGlm<-rownames(dgeGlmTagDisp)[as.logical(deGlm)]
plotSmear(lrt, de.tags=tagsGlm)
Selecting genes with a false discovery rate (FDR) below 0.1.
hits2<-ttGlm$table[ttGlm$table$FDR<0.1,]
Writing it back to our directory.
write.csv(hits2, "Mouse_EdgeR_Resluts_Zero_vs_1.csv")
Pulling up the available columns from this database.
columns(org.Mm.eg.db)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
## [6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME"
## [11] "GENETYPE" "GO" "GOALL" "IPI" "MGI"
## [16] "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM" "PMID"
## [21] "PROSITE" "REFSEQ" "SYMBOL" "UNIPROT"
ttGlm2<-as.data.frame(ttGlm$table)
ttGlm2$symbol=mapIds(org.Mm.eg.db,
keys=row.names(ttGlm2),
column="SYMBOL",
keytype="ENSEMBL",
multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
ttGlm2$entrez=mapIds(org.Mm.eg.db,
keys=row.names(ttGlm2),
column="ENTREZID",
keytype="ENSEMBL",
multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
# Note: this call overwrites ttGlm2$entrez (the Entrez IDs mapped above)
# with gene names; storing the result in a separate column, e.g.
# ttGlm2$genename, would keep both.
ttGlm2$entrez=mapIds(org.Mm.eg.db,
keys=row.names(ttGlm2),
column="GENENAME",
keytype="ENSEMBL",
multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
Now we have the gene symbol, log-CPM, P value, etc. (the entrez column holds gene names because of the overwrite noted above).
head(ttGlm2)
## logFC logCPM LR PValue FDR symbol
## ENSMUSG00000026077 -1.36 3.6 80 4.3e-19 5.9e-15 Npas2
## ENSMUSG00000007872 0.89 5.5 77 1.9e-18 1.3e-14 Id3
## ENSMUSG00000021775 0.95 6.2 63 2.0e-15 9.1e-12 Nr1d2
## ENSMUSG00000002250 -0.83 5.3 62 2.7e-15 9.2e-12 Ppard
## ENSMUSG00000059824 2.26 4.6 58 2.6e-14 7.2e-11 Dbp
## ENSMUSG00000074715 -1.99 3.8 47 7.0e-12 1.6e-08 Ccl28
## entrez
## ENSMUSG00000026077 neuronal PAS domain protein 2
## ENSMUSG00000007872 inhibitor of DNA binding 3
## ENSMUSG00000021775 nuclear receptor subfamily 1, group D, member 2
## ENSMUSG00000002250 peroxisome proliferator activator receptor delta
## ENSMUSG00000059824 D site albumin promoter binding protein
## ENSMUSG00000074715 chemokine (C-C motif) ligand 28
Now we have to load the following packages:
library(pathview)
library(gage)
library(gageData)
data(kegg.sets.mm)
data(go.subs.mm)
foldchanges<-ttGlm2$logFC
names(foldchanges)<- ttGlm2$entrez
kegg.mm=kegg.gsets("mouse", id.type="entrez")
kegg.mm.sigmet=kegg.mm$kg.sets[kegg.mm$sigmet.idx]
Let's get the results. (Because the fold-change vector was named with gene names rather than Entrez IDs, nothing maps to the KEGG sets, so the tables below are all NA with set sizes of 0.)
keggres=gage(foldchanges, gsets=kegg.mm.sigmet, same.dir=TRUE)
lapply(keggres, head)
## $greater
## p.geomean stat.mean p.val q.val
## mmu00970 Aminoacyl-tRNA biosynthesis NA NaN NA NA
## mmu02010 ABC transporters NA NaN NA NA
## mmu03008 Ribosome biogenesis in eukaryotes NA NaN NA NA
## mmu03010 Ribosome NA NaN NA NA
## mmu03013 RNA transport NA NaN NA NA
## mmu03015 mRNA surveillance pathway NA NaN NA NA
## set.size exp1
## mmu00970 Aminoacyl-tRNA biosynthesis 0 NA
## mmu02010 ABC transporters 0 NA
## mmu03008 Ribosome biogenesis in eukaryotes 0 NA
## mmu03010 Ribosome 0 NA
## mmu03013 RNA transport 0 NA
## mmu03015 mRNA surveillance pathway 0 NA
##
## $less
## p.geomean stat.mean p.val q.val
## mmu00970 Aminoacyl-tRNA biosynthesis NA NaN NA NA
## mmu02010 ABC transporters NA NaN NA NA
## mmu03008 Ribosome biogenesis in eukaryotes NA NaN NA NA
## mmu03010 Ribosome NA NaN NA NA
## mmu03013 RNA transport NA NaN NA NA
## mmu03015 mRNA surveillance pathway NA NaN NA NA
## set.size exp1
## mmu00970 Aminoacyl-tRNA biosynthesis 0 NA
## mmu02010 ABC transporters 0 NA
## mmu03008 Ribosome biogenesis in eukaryotes 0 NA
## mmu03010 Ribosome 0 NA
## mmu03013 RNA transport 0 NA
## mmu03015 mRNA surveillance pathway 0 NA
##
## $stats
## stat.mean exp1
## mmu00970 Aminoacyl-tRNA biosynthesis NaN NA
## mmu02010 ABC transporters NaN NA
## mmu03008 Ribosome biogenesis in eukaryotes NaN NA
## mmu03010 Ribosome NaN NA
## mmu03013 RNA transport NaN NA
## mmu03015 mRNA surveillance pathway NaN NA
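A sketch of the fix (assuming the Entrez IDs had been kept in their own column; entrez_ids here is a hypothetical column name): name the fold changes by Entrez ID so GAGE can match them to the KEGG sets.
foldchanges_fixed<-ttGlm2$logFC
names(foldchanges_fixed)<-ttGlm2$entrez_ids # Entrez IDs, not gene names
keggres_fixed<-gage(foldchanges_fixed, gsets=kegg.mm.sigmet, same.dir=TRUE)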
greaters<- keggres$greater
lessers<-keggres$less
keggrespathways=data.frame(id=rownames(keggres$greater), keggres$greater) %>%
tbl_df() %>%
filter(row_number()<=5)%>%
.$id%>%
as.character()
Here is what our keggrespathways vector looks like.
keggrespathways
## [1] "mmu00970 Aminoacyl-tRNA biosynthesis"
## [2] "mmu02010 ABC transporters"
## [3] "mmu03008 Ribosome biogenesis in eukaryotes"
## [4] "mmu03010 Ribosome"
## [5] "mmu03013 RNA transport"
Now we can keep just the first 8 characters, since every value starts with "mmu" followed by the five-digit pathway number.
keggresids_greater=substr(keggrespathways, start=1, stop=8)
keggresids_greater, trimmed to 8 characters:
keggresids_greater
## [1] "mmu00970" "mmu02010" "mmu03008" "mmu03010" "mmu03013"
Creating a plot_pathway function.
plot_pathway = function(pid) pathview(gene.data=foldchanges, pathway.id=pid, species="mouse",
new.signature=FALSE)
Plot multiple pathways.
tmp=sapply(keggresids_greater, function(pid) pathview(gene.data=foldchanges, pathway.id=pid, species="mouse"))
## Warning: None of the genes or compounds mapped to the pathway!
## Argument gene.idtype or cpd.idtype may be wrong.
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory /home/student/Desktop/classroom/myfiles
## Info: Writing image file mmu00970.pathview.png
## Warning: None of the genes or compounds mapped to the pathway!
## Argument gene.idtype or cpd.idtype may be wrong.
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory /home/student/Desktop/classroom/myfiles
## Info: Writing image file mmu02010.pathview.png
## Warning: None of the genes or compounds mapped to the pathway!
## Argument gene.idtype or cpd.idtype may be wrong.
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory /home/student/Desktop/classroom/myfiles
## Info: Writing image file mmu03008.pathview.png
## Warning: None of the genes or compounds mapped to the pathway!
## Argument gene.idtype or cpd.idtype may be wrong.
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory /home/student/Desktop/classroom/myfiles
## Info: Writing image file mmu03010.pathview.png
## Warning: None of the genes or compounds mapped to the pathway!
## Argument gene.idtype or cpd.idtype may be wrong.
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory /home/student/Desktop/classroom/myfiles
## Info: Writing image file mmu03013.pathview.png
Here we are doing the same thing except for the lesser pathway.
keggrespathways=data.frame(id=rownames(keggres$less), keggres$less)%>%
tbl_df() %>%
filter(row_number()<=5)%>%
.$id%>%
as.character()
This is what this pathway layout looks like.
keggrespathways
## [1] "mmu00970 Aminoacyl-tRNA biosynthesis"
## [2] "mmu02010 ABC transporters"
## [3] "mmu03008 Ribosome biogenesis in eukaryotes"
## [4] "mmu03010 Ribosome"
## [5] "mmu03013 RNA transport"
Removing the pathway description and keeping only the first 8 characters (the pathway ID).
keggresids_lesser=substr(keggrespathways, start=1, stop=8)
Here is what that looks like.
keggresids_lesser
## [1] "mmu00970" "mmu02010" "mmu03008" "mmu03010" "mmu03013"
Plotting multiple pathways.
plot_pathway = function(pid) pathview(gene.data=foldchanges, pathway.id=pid, species="mouse",
new.signature=FALSE)
tmp=sapply(keggresids_lesser, function(pid) pathview(gene.data=foldchanges, pathway.id=pid, species="mouse"))
## Warning: None of the genes or compounds mapped to the pathway!
## Argument gene.idtype or cpd.idtype may be wrong.
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory /home/student/Desktop/classroom/myfiles
## Info: Writing image file mmu00970.pathview.png
## Warning: None of the genes or compounds mapped to the pathway!
## Argument gene.idtype or cpd.idtype may be wrong.
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory /home/student/Desktop/classroom/myfiles
## Info: Writing image file mmu02010.pathview.png
## Warning: None of the genes or compounds mapped to the pathway!
## Argument gene.idtype or cpd.idtype may be wrong.
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory /home/student/Desktop/classroom/myfiles
## Info: Writing image file mmu03008.pathview.png
## Warning: None of the genes or compounds mapped to the pathway!
## Argument gene.idtype or cpd.idtype may be wrong.
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory /home/student/Desktop/classroom/myfiles
## Info: Writing image file mmu03010.pathview.png
## Warning: None of the genes or compounds mapped to the pathway!
## Argument gene.idtype or cpd.idtype may be wrong.
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory /home/student/Desktop/classroom/myfiles
## Info: Writing image file mmu03013.pathview.png
Now that we have generated all of the pathway images, we have to load a package to display them.
library(imager)
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
##
## Attaching package: 'imager'
## The following object is masked from 'package:magrittr':
##
## add
## The following object is masked from 'package:Biostrings':
##
## width
## The following object is masked from 'package:XVector':
##
## width
## The following objects are masked from 'package:oligoClasses':
##
## B, B<-
## The following objects are masked from 'package:IRanges':
##
## resize, width
## The following object is masked from 'package:S4Vectors':
##
## width
## The following object is masked from 'package:Biobase':
##
## channel
## The following object is masked from 'package:BiocGenerics':
##
## width
## The following object is masked from 'package:stringr':
##
## boundary
## The following object is masked from 'package:tidyr':
##
## fill
## The following objects are masked from 'package:stats':
##
## convolve, spectrum
## The following object is masked from 'package:graphics':
##
## frame
## The following object is masked from 'package:base':
##
## save.image
We then list the pathview PNG files from the output directory and load them.
filenames<-list.files(path= "E:/Bioinformatics/Bisc_450_Scripts/mouse_edgeR", pattern=".*pathview.png")
all_images<-lapply(filenames, load.image)
Then use knitr::include_graphics() to embed the images.
knitr::include_graphics(filenames)
In this section, we are going to analyze a mouse experiment using multiple packages, one of which is DESeq2.
Let's load the required libraries for this analysis.
library("DESeq2")
## Loading required package: GenomicRanges
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
## The following object is masked from 'package:dplyr':
##
## count
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## The following object is masked from 'package:Biobase':
##
## rowMedians
library("dplyr")
install.packages("apeglm")
## Installing package into '/home/student/R/x86_64-pc-linux-gnu-library/4.1'
## (as 'lib' is unspecified)
## Warning: package 'apeglm' is not available for this version of R
##
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
Let's load in our data.
countData<-read.csv("GLDS-102_rna_seq_Unnormalized_Counts.csv")
colData<-read.csv("PHENO_DATA_Mouse.csv", sep=",", row.names=1)
Now we need to set the factor levels for the group column.
colData$group<-factor(colData$group, levels=c("0", "1"))
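A quick check (not part of the original run) that the releveling worked:
levels(colData$group) # should print "0" "1"
table(colData$group) # sample counts per group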
Now let's make sure the sample names match between the two objects. (Note the misplaced parenthesis below: the intended call is all(rownames(colData) %in% colnames(countData)); as written, all() coerces the row names to logical, which produces the warning and the FALSE.)
all(rownames(colData))%in% colnames(countData)
## Warning in all(rownames(colData)): coercing argument of type 'character' to
## logical
## [1] FALSE
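The corrected check, with the parenthesis wrapped around the whole %in% expression, would be:
all(rownames(colData) %in% colnames(countData))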
We need to round up the counts because DESeq2 only accepts integers as input, not fractional numbers. Whether rounding is needed depends on the alignment and quantification method used upstream.
head(countData%>%
mutate_if(is.numeric, ceiling))
## X Mmus_C57_6J_KDN_FLT_Rep1_M23 Mmus_C57_6J_KDN_FLT_Rep2_M24
## 1 ENSMUSG00000000001 2811 2742
## 2 ENSMUSG00000000003 0 0
## 3 ENSMUSG00000000028 59 49
## 4 ENSMUSG00000000031 26 19
## 5 ENSMUSG00000000037 20 26
## 6 ENSMUSG00000000049 0 1
## Mmus_C57_6J_KDN_FLT_Rep3_M25 Mmus_C57_6J_KDN_FLT_Rep4_M26
## 1 3578 3118
## 2 0 0
## 3 102 72
## 4 17 23
## 5 37 30
## 6 5 10
## Mmus_C57_6J_KDN_FLT_Rep5_M27 Mmus_C57_6J_KDN_FLT_Rep6_M28
## 1 3610 3102
## 2 0 0
## 3 80 84
## 4 26 18
## 5 22 20
## 6 0 4
## Mmus_C57_6J_KDN_GC_Rep1_M33 Mmus_C57_6J_KDN_GC_Rep2_M34
## 1 3094 3112
## 2 0 0
## 3 56 75
## 4 15 21
## 5 20 27
## 6 1 4
## Mmus_C57_6J_KDN_GC_Rep3_M35 Mmus_C57_6J_KDN_GC_Rep4_M36
## 1 3195 3158
## 2 0 0
## 3 60 64
## 4 21 25
## 5 8 20
## 6 10 2
## Mmus_C57_6J_KDN_GC_Rep5_M37 Mmus_C57_6J_KDN_GC_Rep6_M38
## 1 3339 2958
## 2 0 0
## 3 56 79
## 4 24 18
## 5 30 36
## 6 1 1
countData[,2:13]<-sapply(countData[,2:13], as.integer)
row.names(countData)<- countData[,1]
countData<-countData[2:13]
rownames(colData)==colnames(countData)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
Building the DESeqDataSet, filtering low-count genes, and running DESeq().
dds<-DESeqDataSetFromMatrix(countData=countData, colData=colData, design= ~group)
dds<-dds[rowSums(counts(dds)>2) >=4]
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res<-results(dds)
We could not run this chunk due to difficulties installing the "apeglm" package, but this is what the code would look like:
resLFC<-lfcShrink(dds, coef=2)
(resOrdered<-res[order(res$padj), ])
And here is the summary for it.
summary(res)
##
## out of 22008 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 325, 1.5%
## LFC < 0 (down) : 327, 1.5%
## outliers [1] : 15, 0.068%
## low counts [2] : 7247, 33%
## (mean count < 38)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
We can also pull the log2 fold changes into a new variable, FLT vs GC.
FLT_Vs_GC<-as.data.frame(res$log2FoldChange)
Here is the top few lines of this.
head(FLT_Vs_GC)
## res$log2FoldChange
## 1 0.0421
## 2 -0.1334
## 3 -0.0185
## 4 -0.0882
## 5 -0.0079
## 6 0.1136
We can also plot it and save it to a PDF, but we cannot run this here because it requires the resLFC object from the "apeglm" chunk.
plotMA(resLFC, ylim=c(-2, 2))
pdf(file="MA_Plot_FLT_vs_GC.pdf")
dev.off()
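Note that in the chunk above plotMA() runs before pdf() opens, so the PDF would come out empty even if resLFC existed; to capture the plot, the device has to be opened first (still assuming apeglm/resLFC were available):
pdf(file="MA_Plot_FLT_vs_GC.pdf")
plotMA(resLFC, ylim=c(-2, 2))
dev.off()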
Saving the differential expression results to a file; this needs resOrdered from the chunk above, so it is shown but not run here.
write.csv(as.data.frame(resOrdered), file="Mouse_DESeq.csv")
Performing a custom transformation.
dds<-estimateSizeFactors(dds)
se<- SummarizedExperiment(log2(counts(dds, normalized=TRUE)+1), colData=colData(dds))
pdf(file="PCA_PLOT_FLT_vs_GC.pdf")
plotPCA(DESeqTransform(se), intgroup="group")
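The PDF device opened above is never closed, so the plot may not be flushed to disk; adding a dev.off() after the plot call fixes that:
dev.off() # close the PCA plot device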
Loading required packages.
library(AnnotationDbi)
library(org.Mm.eg.db)
Listing the annotation columns available in org.Mm.eg.db.
columns(org.Mm.eg.db)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
## [6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME"
## [11] "GENETYPE" "GO" "GOALL" "IPI" "MGI"
## [16] "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM" "PMID"
## [21] "PROSITE" "REFSEQ" "SYMBOL" "UNIPROT"
foldchanges<-as.data.frame(res$log2FoldChange, row.names=row.names(res))
head(foldchanges)
## res$log2FoldChange
## ENSMUSG00000000001 0.0421
## ENSMUSG00000000028 -0.1334
## ENSMUSG00000000031 -0.0185
## ENSMUSG00000000037 -0.0882
## ENSMUSG00000000049 -0.0079
## ENSMUSG00000000056 0.1136
res$symbol=mapIds(org.Mm.eg.db,
keys=row.names(res),
column="SYMBOL",
keytype="ENSEMBL",
multiVals="first")
## 'select()' returned 1:many mapping between keys and columns
res$entrez=mapIds(org.Mm.eg.db,
keys=row.names(res),
column="ENTREZID",
keytype="ENSEMBL",
multiVals="first")
## 'select()' returned 1:many mapping between keys and columns
res$name=mapIds(org.Mm.eg.db,
keys=row.names(res),
column="GENENAME",
keytype="ENSEMBL",
multiVals="first")
## 'select()' returned 1:many mapping between keys and columns
head(res)
## log2 fold change (MLE): group 1 vs 0
## Wald test p-value: group 1 vs 0
## DataFrame with 6 rows and 9 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSMUSG00000000001 3132.35128 0.04214340 0.0436714 0.9650117 0.334539
## ENSMUSG00000000028 68.75801 -0.13342706 0.1565936 -0.8520597 0.394181
## ENSMUSG00000000031 21.05397 -0.01853142 0.2486477 -0.0745288 0.940590
## ENSMUSG00000000037 24.42314 -0.08817270 0.2982220 -0.2956613 0.767489
## ENSMUSG00000000049 3.24919 -0.00790342 0.9613572 -0.0082211 0.993441
## ENSMUSG00000000056 1424.88216 0.11355979 0.0777635 1.4603234 0.144201
## padj symbol entrez name
## <numeric> <character> <character> <character>
## ENSMUSG00000000001 0.739637 Gnai3 14679 guanine nucleotide b..
## ENSMUSG00000000028 0.777085 Cdc45 12544 cell division cycle 45
## ENSMUSG00000000031 NA H19 14955 H19, imprinted mater..
## ENSMUSG00000000037 NA Scml2 107815 Scm polycomb group p..
## ENSMUSG00000000049 NA Apoh 11818 apolipoprotein H
## ENSMUSG00000000056 0.547527 Narf 67608 nuclear prelamin A r..
Loading pathview packages.
library(pathview)
library(gage)
library(gageData)
data(kegg.sets.mm)
data(go.subs.mm)
foldchanges<-res$log2FoldChange
names(foldchanges)= res$entrez
head(foldchanges)
## 14679 12544 14955 107815 11818 67608
## 0.0421 -0.1334 -0.0185 -0.0882 -0.0079 0.1136
kegg.mm<-kegg.gsets("mouse", id.type ="entrez")
kegg.mm.sigmet<-kegg.mm$kg.sets[kegg.mm$sigmet.idx]
Running GAGE on the fold changes.
keggres<-gage(foldchanges, gsets=kegg.mm.sigmet, same.dir=TRUE)
lapply(keggres, head)
## $greater
## p.geomean stat.mean p.val q.val
## mmu03010 Ribosome 0.0056 2.6 0.0056 0.9
## mmu04022 cGMP-PKG signaling pathway 0.0309 1.9 0.0309 0.9
## mmu04360 Axon guidance 0.0382 1.8 0.0382 0.9
## mmu04330 Notch signaling pathway 0.0518 1.6 0.0518 0.9
## mmu04658 Th1 and Th2 cell differentiation 0.0538 1.6 0.0538 0.9
## mmu04350 TGF-beta signaling pathway 0.0544 1.6 0.0544 0.9
## set.size exp1
## mmu03010 Ribosome 133 0.0056
## mmu04022 cGMP-PKG signaling pathway 153 0.0309
## mmu04360 Axon guidance 176 0.0382
## mmu04330 Notch signaling pathway 55 0.0518
## mmu04658 Th1 and Th2 cell differentiation 78 0.0538
## mmu04350 TGF-beta signaling pathway 87 0.0544
##
## $less
## p.geomean stat.mean p.val
## mmu04657 IL-17 signaling pathway 0.011 -2.3 0.011
## mmu04110 Cell cycle 0.020 -2.1 0.020
## mmu04145 Phagosome 0.027 -1.9 0.027
## mmu04621 NOD-like receptor signaling pathway 0.042 -1.7 0.042
## mmu04625 C-type lectin receptor signaling pathway 0.044 -1.7 0.044
## mmu04115 p53 signaling pathway 0.048 -1.7 0.048
## q.val set.size exp1
## mmu04657 IL-17 signaling pathway 0.9 75 0.011
## mmu04110 Cell cycle 0.9 121 0.020
## mmu04145 Phagosome 0.9 145 0.027
## mmu04621 NOD-like receptor signaling pathway 0.9 151 0.042
## mmu04625 C-type lectin receptor signaling pathway 0.9 99 0.044
## mmu04115 p53 signaling pathway 0.9 71 0.048
##
## $stats
## stat.mean exp1
## mmu03010 Ribosome 2.6 2.6
## mmu04022 cGMP-PKG signaling pathway 1.9 1.9
## mmu04360 Axon guidance 1.8 1.8
## mmu04330 Notch signaling pathway 1.6 1.6
## mmu04658 Th1 and Th2 cell differentiation 1.6 1.6
## mmu04350 TGF-beta signaling pathway 1.6 1.6
Saving greater and less than pathways.
greaters<-keggres$greater
lessers<-keggres$less
keggrespathways<-data.frame(id=rownames(keggres$greater), keggres$greater)%>%
tbl_df() %>%
filter(row_number()<=3)%>%
.$id%>%
as.character()
keggrespathways
## [1] "mmu03010 Ribosome" "mmu04022 cGMP-PKG signaling pathway"
## [3] "mmu04360 Axon guidance"
keggresids<-substr(keggrespathways, start=1, stop=8)
keggresids
## [1] "mmu03010" "mmu04022" "mmu04360"
Plotting.
plot_pathway=function(pid) pathview(gene.data=foldchanges, pathway.id=pid, species="mouse", new.signature=FALSE)
tmp=sapply(keggresids, function(pid) pathview(gene.data=foldchanges, pathway.id=pid, species="mouse"))
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory /home/student/Desktop/classroom/myfiles
## Info: Writing image file mmu03010.pathview.png
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory /home/student/Desktop/classroom/myfiles
## Info: Writing image file mmu04022.pathview.png
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory /home/student/Desktop/classroom/myfiles
## Info: Writing image file mmu04360.pathview.png
keggrespathways<-data.frame(id=rownames(keggres$less), keggres$less)%>%
tbl_df() %>%
filter(row_number()<=3)%>%
.$id%>%
as.character()
keggrespathways
## [1] "mmu04657 IL-17 signaling pathway" "mmu04110 Cell cycle"
## [3] "mmu04145 Phagosome"
keggresids<-substr(keggrespathways, start=1, stop=8)
keggresids
## [1] "mmu04657" "mmu04110" "mmu04145"
Plotting.
plot_pathway=function(pid) pathview(gene.data=foldchanges, pathway.id=pid, species="mouse", new.signature=FALSE)
tmp=sapply(keggresids, function(pid) pathview(gene.data=foldchanges, pathway.id=pid, species="mouse"))
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory /home/student/Desktop/classroom/myfiles
## Info: Writing image file mmu04657.pathview.png
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory /home/student/Desktop/classroom/myfiles
## Info: Writing image file mmu04110.pathview.png
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory /home/student/Desktop/classroom/myfiles
## Info: Writing image file mmu04145.pathview.png
library(imager)
filenames<- list.files(path="E:/Bioinformatics/Bisc_450_Scripts/mouse_DEseq", pattern=".*pathview.png")
all_images<-lapply(filenames, load.image)
knitr::include_graphics(filenames)
First let's load tidyverse.
library(tidyverse)
Now we can read the edgeR results into a variable.
EdgeR<-read.csv("Mouse_EdgeR_Resluts_Zero_vs_1.csv")
I could not get this section of code to run reliably; it ran the first time, but would not run after that.
Then we can read the csv of "Mouse_DESeq.csv" into our DESeq variable.
DESeq<-read.csv("Mouse_DESeq.csv")
Filtering down to adjusted p-values below 0.05.
DESeq2<-DESeq%>%
filter(padj<0.05)
We will use the GOVenn() function from the GOplot package, so the columns need to be trimmed down first.
DESeq2<-DESeq2[,c(1:3)]
EdgeR<-EdgeR[, 1:2]
colnames(DESeq2)<-c("ID","logFC")
colnames(EdgeR)<- c("ID","logFC")
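Before plotting, a quick optional look (my own addition, not in the original run) at how many significant IDs the two methods share:
length(intersect(DESeq2$ID, EdgeR$ID)) # genes flagged by both methods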
install.packages("GOplot")
Next, we can load GOplot.
library(GOplot)
## Loading required package: ggdendro
##
## Attaching package: 'ggdendro'
## The following object is masked from 'package:imager':
##
## label
## Loading required package: gridExtra
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:Biobase':
##
## combine
## The following object is masked from 'package:BiocGenerics':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
## Loading required package: RColorBrewer
We can now pass both data frames to GOVenn(), along with labels and a title, and store the result in a variable.
comp<-GOVenn(DESeq2, EdgeR, label=c("DESeq","EdgeR"), title="Comparison of DESeq and EdgeR DE Genes", plot=FALSE)
Plotting comp to compare the DESeq and EdgeR DE genes.
comp$plot
The table version of “comp”.
head(comp$table)
Let's dive into multiple protein alignment. First we have to load the package "msa".
library(msa)
Then we read the amino acid sequences in the FASTA file "hglobin.fa" into a variable.
seq<-readAAStringSet("hglobin.fa")
Here is what that looks like.
seq
## AAStringSet object of length 8:
## width seq names
## [1] 142 MVLSPADKTNVKAAWGKVGAHAG...PAVHASLDKFLASVSTVLTSKYR HBA_HUMAN
## [2] 142 MVLSGEDKSNIKAAWGKIGGHGA...PAVHASLDKFLASVSTVLTSKYR HBA_MOUSE
## [3] 142 MVLSPADKTIVKAAWGKVGAHAG...PAVHASLDKFLASVSTVLTSKYR HBA_Porpoise
## [4] 142 MVLSPADKTNVKAAWGKVGAHAG...PAVHASLDKFLASVSTVLTSKYR HBA_Gorilla
## [5] 142 MVLSPADKTNVKTAWGKVGAHAG...PAVHASLDKFLASVSTVLTSKYR HBA_Capucin
## [6] 142 MTLTQAEKAAVVTIWAKVATQAD...PEVHAAWDKFLSSVSSVLTEKYR HBA_Owl_Parrot
## [7] 141 VLSSGDKANVKSVWSKVQGHLED...PDVHVSLDKFMGTVSTVLTSKYR HBA_Loggerhead
## [8] 142 MVLSDDDKAKVRAAWVPVAKNAE...PSVILALDKFLDLVAKVLVSRYR HBA_Pit_Viper
Let's align the eight amino acid sequences with ClustalW.
alignments<- msa(seq, method="ClustalW")
## use default substitution matrix
Results:
alignments
## CLUSTAL 2.1
##
## Call:
## msa(seq, method = "ClustalW")
##
## MsaAAMultipleAlignment with 8 rows and 142 columns
## aln names
## [1] MVLSPADKTNVKAAWGKVGAHAGEYG...FTPAVHASLDKFLASVSTVLTSKYR HBA_HUMAN
## [2] MVLSPADKTNVKAAWGKVGAHAGEYG...FTPAVHASLDKFLASVSTVLTSKYR HBA_Gorilla
## [3] MVLSPADKTIVKAAWGKVGAHAGEYG...FTPAVHASLDKFLASVSTVLTSKYR HBA_Porpoise
## [4] MVLSPADKTNVKTAWGKVGAHAGDYG...FTPAVHASLDKFLASVSTVLTSKYR HBA_Capucin
## [5] MVLSGEDKSNIKAAWGKIGGHGAEYG...FTPAVHASLDKFLASVSTVLTSKYR HBA_MOUSE
## [6] MVLSDDDKAKVRAAWVPVAKNAEMYG...LKPSVILALDKFLDLVAKVLVSRYR HBA_Pit_Viper
## [7] -VLSSGDKANVKSVWSKVQGHLEDYG...FTPDVHVSLDKFMGTVSTVLTSKYR HBA_Loggerhead
## [8] MTLTQAEKAAVVTIWAKVATQADAIG...FTPEVHAAWDKFLSSVSSVLTEKYR HBA_Owl_Parrot
## Con MVLSPADKTNVKAAWGKVGAHAGEYG...FTPAVHASLDKFLASVSTVLTSKYR Consensus
Writing the full alignment to a formatted PDF.
msaPrettyPrint(alignments, output="pdf", showNames="left", showLogo="none",
askForOverwrite= FALSE, verbose=TRUE, file="whole_aligh.pdf")
## Multiple alignment written to temporary file /tmp/Rtmp6YpeKr/seq107e26fbdefb.fasta
## File whole_aligh.tex created
## Warning in texi2dvi(texfile, quiet = !verbose, pdf = identical(output, "pdf"), :
## texi2dvi script/program not available, using emulation
## Output file whole_aligh.pdf created
msaPrettyPrint(alignments, c(10,30), output="pdf", showNames="left",
file="Zoomed_align.pdf", showLogo="top", askForOverwrite=FALSE, verbose=TRUE)
## Multiple alignment written to temporary file /tmp/Rtmp6YpeKr/seq107e13cd4094.fasta
## File Zoomed_align.tex created
## Warning in texi2dvi(texfile, quiet = !verbose, pdf = identical(output, "pdf"), :
## texi2dvi script/program not available, using emulation
## Output file Zoomed_align.pdf created
Let's make a tree from our alignment, but first we have to load the following packages:
library(ape)
##
## Attaching package: 'ape'
## The following object is masked from 'package:imager':
##
## where
## The following object is masked from 'package:Biostrings':
##
## complement
library(seqinr)
##
## Attaching package: 'seqinr'
## The following objects are masked from 'package:ape':
##
## as.alignment, consensus
## The following object is masked from 'package:matrixStats':
##
## count
## The following object is masked from 'package:Biostrings':
##
## translate
## The following object is masked from 'package:limma':
##
## zscore
## The following object is masked from 'package:dplyr':
##
## count
Convert the alignment to the seqinr format, then get the distances and make a tree.
alignment_seqinr<- msaConvert(alignments, type="seqinr::alignment")
distances1<- seqinr::dist.alignment(alignment_seqinr, "identity")
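dist.alignment() returns a dist object; to eyeball the pairwise identity distances you can coerce it to a matrix (a quick sketch):
# view the pairwise distances, rounded for readability
round(as.matrix(distances1), 2)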
Making a neighbor-joining phylogenetic tree based on the HBA sequences and plotting it.
tree<- ape::nj(distances1)
plot(tree, main="Phylogenetic Tree of HBA Sequences")
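If you want to keep this tree for use in other tools, ape can write it out in Newick format (a one-line sketch; the filename is just an example):
ape::write.tree(tree, file="hba_tree.nwk")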
Now we get to look at synteny between genomes. For this we will have to load the package "DECIPHER".
library(DECIPHER)
## Loading required package: RSQLite
## Loading required package: parallel
In the first step, we read the plastid genome sequences into long_seq, which is a DNAStringSet object.
long_seq<-readDNAStringSet("plastid_genomes.fa")
Here is what this looks like.
long_seq
## DNAStringSet object of length 5:
## width seq names
## [1] 130584 GGCATAAGCTATCTTCCCAAAGG...ATGATTCAAACATAAAAGTCCT Saccharina japoni...
## [2] 161592 ATGGGCGAACGACGGGAATTGAA...AAAGAAAAAAAAATAGGAGTAA Asclepias nivea c...
## [3] 117672 ATGAGTACAACTCGAAAGTCCAC...TTGATTTCATCCACAAACGAAC Nannochloropsis g...
## [4] 154731 TTATCCATTTGTAGATGGAACTT...CATATACACTAAGACAAAAGTC Cocos nucifera ch...
## [5] 156618 GGGCGAACGACGGGAATTGAACC...CCTTTTGTAGCGAATCCGTTAT Camellia cuspidat...
Now let's build a temporary SQLite database of the sequences.
Seqs2DB(long_seq, "XStringSet", "long_db", names(long_seq))
## Adding 5 sequences to the database.
##
## 5 total sequences in table Seqs.
## Time difference of 0.15 secs
Now that we’ve built the database, we can find the syntenic blocks.
synteny<-FindSynteny("long_db")
## ================================================================================
##
## Time difference of 2.3 secs
View the blocks with plotting.
pairs(synteny)
plot(synteny)
Make the actual alignments of the syntenic blocks.
alignment<-AlignSynteny(synteny, "long_db")
## ================================================================================
##
## Time difference of 49 secs
Let's create a structure holding all aligned syntenic blocks for one pair of sequences.
block<- unlist(alignment[[1]])
We can write to file one alignment at a time.
writeXStringSet(block, "genome_blocks_out.fa")
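alignment[[1]] only holds the blocks for the first pair of genomes; to write every pairwise set of blocks to its own file, you could loop over the whole list (a sketch, with made-up file names):
# write the aligned blocks for each genome pair to a separate FASTA
for (i in seq_along(alignment)) {
  writeXStringSet(unlist(alignment[[i]]), paste0("genome_blocks_pair_", i, ".fa"))
}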
Let's load the following libraries:
library(locfit)
## locfit 1.5-9.5 2022-03-01
##
## Attaching package: 'locfit'
## The following object is masked from 'package:purrr':
##
## none
library(Rsamtools)
Let's create a function that loads the gene region information from a GFF file and converts it to a Bioconductor GRanges object.
get_annotated_regions_from_gff<-function(file_name){
gff<-rtracklayer::import.gff(file_name)
as(gff,"GRanges")
}
Get counts in 500 bp windows across the genome.
whole_genome<-csaw::windowCounts(
  file.path("windows.bam"),
  bin=TRUE,
  filter=0,
  width=500,
  param=csaw::readParam(
    minq=20, # minimum mapping quality for a passing read
    dedup=TRUE, # removes PCR duplicates
    pe="both" # requires both reads of a pair to be aligned
  )
)
Since this is a single column of data, let’s rename it.
colnames(whole_genome)<-c("small_data")
annotated_regions<-get_annotated_regions_from_gff(file.path("genes.gff"))
Now that we have the windows of high expression, we want to see if any of them overlap with annotated regions. To do this, the packages “IRanges” and “SummarizedExperiment” are needed.
library(IRanges)
library(SummarizedExperiment)
Find the overlaps between the windows we made and the annotated regions.
windows_in_genes<- IRanges::overlapsAny(
SummarizedExperiment::rowRanges(whole_genome), #creates a Granges object
annotated_regions
)
These are the results:
windows_in_genes
## [1] TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE
Here we subset the whole_genome object into annotated and non-annotated regions.
annotated_window_counts<- whole_genome[windows_in_genes,]
non_annotated_window_counts<- whole_genome[!windows_in_genes,]
Using assay() to extract the actual counts from the windowed counts object.
assay(non_annotated_window_counts)
## small_data
## [1,] 0
## [2,] 0
## [3,] 24
## [4,] 25
## [5,] 0
## [6,] 0
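As a quick sanity check, we can compare the average read count inside and outside the annotated regions (a sketch; assay() just returns a plain count matrix):
# mean counts per window, annotated vs. non-annotated
mean(assay(annotated_window_counts))
mean(assay(non_annotated_window_counts))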
In this step, we use the Rsamtools pileup() function to get a per-base coverage data frame. Each row represents a single nucleotide in the reference, and the count column gives the depth of coverage at that point. First load "bumphunter" for this.
library(bumphunter)
## Loading required package: foreach
## Parallel computing support for 'oligo/crlmm': Disabled
## - Load 'ff'
## - Load and register a 'foreach' adaptor
## Example - Using 'multicore' for 2 cores:
## library(doMC)
## registerDoMC(2)
## ================================================================================
##
## Attaching package: 'foreach'
##
## The following objects are masked from 'package:purrr':
##
## accumulate, when
##
## Loading required package: iterators
pile_df<- Rsamtools::pileup(file.path("windows.bam"))
This step groups reads within a certain distance of each other into clusters. We give the sequence names, positions, and maximum gap.
clusters<- bumphunter::clusterMaker(pile_df$seqnames, pile_df$pos, maxGap= 100)
And now view the table form.
table(clusters)
## clusters
## 1 2 3
## 1486 1552 1520
In this step, we find the regions of coverage above our cutoff within the clusters.
bumphunter::regionFinder(pile_df$count, pile_df$seqnames, pile_df$pos, clusters, cutoff=1)
## getSegments: segmenting
## getSegments: splitting
## chr start end value area cluster indexStart indexEnd L clusterL
## 3 Chr1 4503 5500 10.4 15811 3 3039 4558 1520 1520
## 1 Chr1 502 1500 10.0 14839 1 1 1486 1486 1486
## 2 Chr1 2501 3500 8.7 13436 2 1487 3038 1552 1552
Let's load the required packages.
library(ggplot2)
library(ggtree)
## ggtree v3.2.1 For help: https://yulab-smu.top/treedata-book/
##
## If you use ggtree in published research, please cite the most appropriate paper(s):
##
## 1. Guangchuang Yu. Using ggtree to visualize data on tree-like structures. Current Protocols in Bioinformatics. 2020, 69:e96. doi:10.1002/cpbi.96
## 2. Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods for mapping and visualizing associated data on phylogeny using ggtree. Molecular Biology and Evolution. 2018, 35(12):3041-3043. doi:10.1093/molbev/msy194
## 3. Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam. ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data. Methods in Ecology and Evolution. 2017, 8(1):28-36. doi:10.1111/2041-210X.12628
##
## Attaching package: 'ggtree'
## The following object is masked from 'package:ape':
##
## rotate
## The following object is masked from 'package:magrittr':
##
## inset
## The following object is masked from 'package:reshape':
##
## expand
## The following object is masked from 'package:Biostrings':
##
## collapse
## The following object is masked from 'package:IRanges':
##
## collapse
## The following object is masked from 'package:S4Vectors':
##
## expand
## The following object is masked from 'package:tidyr':
##
## expand
library(treeio)
## treeio v1.18.1 For help: https://yulab-smu.top/treedata-book/
##
## If you use treeio in published research, please cite:
##
## LG Wang, TTY Lam, S Xu, Z Dai, L Zhou, T Feng, P Guo, CW Dunn, BR Jones, T Bradley, H Zhu, Y Guan, Y Jiang, G Yu. treeio: an R package for phylogenetic tree input and output with richly annotated and associated data. Molecular Biology and Evolution 2020, 37(2):599-603. doi: 10.1093/molbev/msz240
##
## Attaching package: 'treeio'
## The following object is masked from 'package:seqinr':
##
## read.fasta
## The following object is masked from 'package:ape':
##
## drop.tip
## The following object is masked from 'package:Biostrings':
##
## mask
First we need to load our raw tree data. It's in Newick format:
itol<-ape::read.tree("itol.nwk")
Now we will print out a very basic phylogenetic tree.
ggtree(itol)
We can also change the format to make it a circular tree.
ggtree(itol, layout="circular")
We can also flip the left-right / up-down direction.
ggtree(itol) + coord_flip() +scale_x_reverse()
By using geom_tiplab() we can add names to the ends of the tips.
ggtree(itol) + geom_tiplab(color="indianred", size=0.5)
We can annotate clades in the tree with a block of color.
ggtree(itol, layout="unrooted") +geom_strip(13,14, color="red", barsize=1)
## "daylight" method was used as default layout for unrooted tree.
## Average angle change [1] 0.190413138751168
## Average angle change [2] 0.144336349355351
## Average angle change [3] 0.123242001597025
## Average angle change [4] 0.104391847943619
## Average angle change [5] 0.0800193428708038
We can highlight clades in unrooted trees with blobs of color using geom_hilight.
ggtree(itol, layout="unrooted") + geom_hilight(node=11, type="encircle", fill="steelblue")
## "daylight" method was used as default layout for unrooted tree.
## Average angle change [1] 0.190413138751168
## Average angle change [2] 0.144336349355351
## Average angle change [3] 0.123242001597025
## Average angle change [4] 0.104391847943619
## Average angle change [5] 0.0800193428708038
Next we can install the following packages:
install.packages('BAMMtools')
## Installing package into '/home/student/R/x86_64-pc-linux-gnu-library/4.1'
## (as 'lib' is unspecified)
library(BAMMtools)
We can use the getmrca() (most recent common ancestor) function to find the node we want.
getmrca(itol, "Photorhabdus_luminescens", "Blochmannia_floridanus")
## [1] 206
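Before highlighting, we can sanity-check that node by marking it on a basic layout; ggtree's geom_point2() accepts a subset aesthetic for exactly this (a sketch):
# put a red dot on node 206 so we can see where the MRCA sits
ggtree(itol) + geom_point2(aes(subset=(node == 206)), size=3, color="red")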
Now if we want to highlight the clade under the most recent common ancestor of the two species above:
ggtree(itol, layout="unrooted")+geom_hilight(node=206, type="encircle", fill="steelblue")
## "daylight" method was used as default layout for unrooted tree.
## Average angle change [1] 0.190413138751168
## Average angle change [2] 0.144336349355351
## Average angle change [3] 0.123242001597025
## Average angle change [4] 0.104391847943619
## Average angle change [5] 0.0800193428708038
Quantifying differences between trees with treespace. First we need 3 packages:
library(ape)
library(adegraphics)
## Registered S3 methods overwritten by 'adegraphics':
## method from
## biplot.dudi ade4
## kplot.foucart ade4
## kplot.mcoa ade4
## kplot.mfa ade4
## kplot.pta ade4
## kplot.sepan ade4
## kplot.statis ade4
## scatter.coa ade4
## scatter.dudi ade4
## scatter.nipals ade4
## scatter.pco ade4
## score.acm ade4
## score.mix ade4
## score.pca ade4
## screeplot.dudi ade4
##
## Attaching package: 'adegraphics'
## The following object is masked from 'package:ape':
##
## zoom
## The following object is masked from 'package:GenomicRanges':
##
## score
## The following object is masked from 'package:Biostrings':
##
## score
## The following object is masked from 'package:BiocGenerics':
##
## score
library(treespace)
## Loading required package: ade4
##
## Attaching package: 'ade4'
## The following objects are masked from 'package:adegraphics':
##
## kplotsepan.coa, s.arrow, s.class, s.corcircle, s.distri, s.image,
## s.label, s.logo, s.match, s.traject, s.value, table.value,
## triangle.class
## The following object is masked from 'package:GenomicRanges':
##
## score
## The following object is masked from 'package:Biostrings':
##
## score
## The following object is masked from 'package:BiocGenerics':
##
## score
## Creating a generic function for 'toJSON' from package 'jsonlite' in package 'googleVis'
Now we need to load all the treefiles into a multiPhylo object.
treefiles<- list.files(file.path(getwd(), "gene_trees"), full.names = TRUE)
tree_list<-lapply(treefiles, read.tree)
class(tree_list)<- "multiPhylo"
class(tree_list)
## [1] "multiPhylo"
Now we can compute the Kendall-Colijn distances between trees. This function does a LOT of analysis.
First it runs a pairwise comparison of all trees in the input, and second it carries out clustering using PCA. The results are returned in a list of objects, where $D contains the pairwise tree distances and $pco contains the PCA. The Kendall-Colijn metric is particularly suitable for rooted trees like the ones we have here. The option nf tells treespace how many principal components to retain.
Due to a coding problem (R suggests the function treeDist, which only compares two trees, even though we supply more), this chunk would not re-run, so I simply set eval=FALSE for the HTML file.
comparisons <- treespace(tree_list, nf=3)
We can plot the pairwise distances between trees with table.image(comparisons$D, nclass=25).
Now let's plot the PCA and clusters; this shows us how the trees group together.
plotGroves(comparisons$pco, lab.show = TRUE, lab.cex=1.5)
Finding groves (clusters of similar trees) and plotting them.
groves<-findGroves(comparisons, nclust = 4)
plotGroves(groves)
Extracting and working with sub-trees using ape.
First load our required packages:
library(ape)
Now let's load the tree data we will be working with.
There seem to be problems with the working directory even when it is set correctly. In a regular R script everything works fine; the problems only occur when transferring the code to R Markdown.
newick<-read.tree(file.path(getwd(), "mammal_tree.nwk.txt"))
l<- subtrees(newick)
Let's plot the tree to see what it looks like.
plot(newick)
We can subset this tree using the list of subtrees:
plot(l[[4]],sub="Node 4")
Extract a clade manually:
small_tree<-extract.clade(newick,9)
plot(small_tree)
Now what if we want to bind two trees together?
new_tree<- bind.tree(newick, small_tree,3)
plot(new_tree)
First we must load the following packages:
library(Biostrings)
library(msa)
library(phangorn)
Then we'll load the sequences into a seqs variable (this chunk hit directory issues similar to those noted earlier).
seqs<-readAAStringSet("abc.fa")
Now, let's construct an alignment with the msa package and ClustalOmega.
aln<-msa::msa(seqs, method=c("ClustalOmega"))
To create a tree, we need to convert the alignment to a phyDat object:
aln<-as.phyDat(aln, type="AA")
and here is the class for “aln”:
class(aln)
In this step, we'll actually make the trees. Trees are made from a distance matrix, which can be computed with dist.ml(); ML stands for maximum likelihood.
dist_mat<- dist.ml(aln)
Now we pass the distance matrix to upgma to make a UPGMA tree and plot it:
upgma_tree<-upgma(dist_mat)
plot(upgma_tree,main="UPGMA")
Alternatively, we can pass the distance matrix to the neighbor-joining function and plot that tree.
nj_tree<-NJ(dist_mat)
plot(nj_tree,main="NJ")
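To compare the two topologies quantitatively, phangorn's treedist() reports several tree distance metrics between a pair of trees (a quick sketch):
# distance metrics between the UPGMA and NJ topologies
phangorn::treedist(upgma_tree, nj_tree)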
First let's load the required libraries:
library(GenomicRanges)
library(gmapR)
library(rtracklayer)
library(VariantAnnotation)
##
## Attaching package: 'VariantAnnotation'
## The following object is masked from 'package:stringr':
##
## fixed
## The following object is masked from 'package:base':
##
## tabulate
library(VariantTools)
Now we want to load our datasets; we need .bam and .fa files for this pipeline to work:
bam_file<-file.path(getwd(),"hg17_snps.bam")
fasta_file<-file.path(getwd(),"chr17.83k.fa")
Now we need to set up the genome object and the parameters object:
fa<-rtracklayer::FastaFile(fasta_file)
Now we create a GmapGenome object, which describes the genome to the later variant-calling function:
genome <- gmapR::GmapGenome(fa, create=TRUE)
## Creating directory /home/student/.local/share/gmap
This next step sets our parameters for what is considered a variant. There can be a lot of fine-tuning with these functions; we are just going to use the standard settings.
qual_params<-TallyVariantsParam(
genome=genome,
minimum_mapq=20)
var_params<-VariantCallingFilters(read.count=19, p.lower=0.01)
Now we use the callVariants() function with the parameters we defined above.
called_variants<-callVariants(bam_file,
qual_params,
calling.filter=var_params)
Here are the first few lines:
head(called_variants)
## VRanges object with 6 ranges and 17 metadata columns:
## seqnames ranges strand ref alt totalDepth
## <Rle> <IRanges> <Rle> <character> <characterOrRle> <integerOrRle>
## [1] NC_000017.10 64 * G T 759
## [2] NC_000017.10 69 * G T 812
## [3] NC_000017.10 70 * G T 818
## [4] NC_000017.10 73 * T A 814
## [5] NC_000017.10 77 * T A 802
## [6] NC_000017.10 78 * G T 798
## refDepth altDepth sampleNames softFilterMatrix | n.read.pos
## <integerOrRle> <integerOrRle> <factorOrRle> <matrix> | <integer>
## [1] 739 20 <NA> | 17
## [2] 790 22 <NA> | 19
## [3] 796 22 <NA> | 20
## [4] 795 19 <NA> | 13
## [5] 780 22 <NA> | 19
## [6] 777 21 <NA> | 17
## n.read.pos.ref raw.count.total count.plus count.plus.ref count.minus
## <integer> <integer> <integer> <integer> <integer>
## [1] 64 759 20 739 0
## [2] 69 812 22 790 0
## [3] 70 818 22 796 0
## [4] 70 814 19 795 0
## [5] 70 802 22 780 0
## [6] 70 798 21 777 0
## count.minus.ref count.del.plus count.del.minus read.pos.mean
## <integer> <integer> <integer> <numeric>
## [1] 0 0 0 30.9000
## [2] 0 0 0 40.7273
## [3] 0 0 0 34.7727
## [4] 0 0 0 36.1579
## [5] 0 0 0 38.3636
## [6] 0 0 0 39.7143
## read.pos.mean.ref read.pos.var read.pos.var.ref mdfne mdfne.ref
## <numeric> <numeric> <numeric> <numeric> <numeric>
## [1] 32.8755 318.558 347.804 NA NA
## [2] 35.4190 377.004 398.876 NA NA
## [3] 36.3442 497.762 402.360 NA NA
## [4] 36.2176 519.551 402.843 NA NA
## [5] 36.0064 472.327 397.070 NA NA
## [6] 35.9241 609.076 390.463 NA NA
## count.high.nm count.high.nm.ref
## <integer> <integer>
## [1] 20 738
## [2] 22 789
## [3] 22 796
## [4] 19 769
## [5] 22 780
## [6] 21 777
## -------
## seqinfo: 1 sequence from chr17.83k genome
## hardFilters(4): nonRef nonNRef readCount likelihoodRatio
head() shows the first six of the called variants.
Now we move on to annotation and load the feature position information from the GFF file:
get_annotated_regions_from_gff<-function(file_name){
gff<-rtracklayer::import.gff(file_name)
as(gff,"GRanged")
}
Note you can also load this data from a BED file.
genes<- get_annotated_regions_from_gff("chr17.83k.gff3")
overlaps<-GenomicRanges::findOverlaps(called_variants, genes)
Here are the results:
overlaps
And lastly we subset the genes with the list of overlaps:
identified<- genes[subjectHits(overlaps)]
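To count how many of the called variants fall inside annotated genes, we can tally the distinct query hits from the overlaps (a quick sketch):
# number of variants overlapping at least one gene
length(unique(S4Vectors::queryHits(overlaps)))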
First thing, let's load the required packages:
library(Biostrings)
library(systemPipeR)
## Loading required package: ShortRead
## Loading required package: BiocParallel
## Loading required package: GenomicAlignments
##
## Attaching package: 'GenomicAlignments'
## The following object is masked from 'package:dplyr':
##
## last
##
## Attaching package: 'ShortRead'
## The following object is masked from 'package:adegraphics':
##
## zoom
## The following objects are masked from 'package:locfit':
##
## left, right
## The following object is masked from 'package:ape':
##
## zoom
## The following object is masked from 'package:imager':
##
## clean
## The following object is masked from 'package:magrittr':
##
## functions
## The following object is masked from 'package:oligo':
##
## intensity
## The following objects are masked from 'package:oligoClasses':
##
## chromosome, position
## The following object is masked from 'package:affy':
##
## intensity
## The following object is masked from 'package:dplyr':
##
## id
## The following object is masked from 'package:purrr':
##
## compose
## The following object is masked from 'package:tibble':
##
## view
##
## Attaching package: 'systemPipeR'
## The following object is masked from 'package:VariantAnnotation':
##
## reference
## The following object is masked from 'package:DESeq2':
##
## results
Let's load the data into a DNAStringSet object; in this case, an Arabidopsis chloroplast genome.
dna_object<-readDNAStringSet("arabidopsis_chloroplast.fa.txt")
Now let's predict the open reading frames with predORF(); we'll predict all ORFs on both strands.
predict_orfs<-predORF(dna_object, n='all', type='gr', mode='ORF', strand= 'both',
longest_disjoint=TRUE)
This returns a GRanges object with 2,501 open reading frames, which is far too many to all be real.
To filter out erroneous ORFs, we run a simulation to estimate the length an ORF can reach by chance. We will create a string of random nucleotides the length of our chloroplast genome and determine the longest ORF that can arise by chance.
bases<-c("A","T","G","C")
raw_seq_string<- strsplit(as.character(dna_object), "")
Now we need to ensure that our random nucleotides match the proportions of nucleotides in our chloroplast genome so we have no bias.
seq_length<-width(dna_object[1])
counts<- lapply(bases,function(x) {sum(grepl(x, raw_seq_string))})
probs<-unlist(lapply(counts, function(base_count){signif(base_count/seq_length, 2)}))
Here is what it looks like:
probs
## [1] 6.5e-06 6.5e-06 6.5e-06 6.5e-06
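Note that these probabilities are suspiciously uniform: grepl() applied to the strsplit() list treats the whole list as one long string, so each base "count" comes out as 1 and every probability is 1/seq_length. A corrected sketch that counts actual base frequencies would be:
# count each base directly in the vector of single nucleotides
base_counts <- vapply(bases, function(b) sum(raw_seq_string[[1]] == b), numeric(1))
probs <- signif(base_counts / seq_length, 2) # these should now sum to roughly 1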
Now we can build our function to simulate a genome.
get_longest_orf_in_random_genome<-function(x,
length=1000,
probs=c(0.25, 0.25,0.25,0.25 ),
bases=c("A","T","G","C")){
random_genome<-paste0(sample(bases, size=length, replace=TRUE, prob=probs), collapse="")
random_dna_object<-DNAStringSet(random_genome)
names(random_dna_object)<-c("random_dna_string")
orfs<-predORF(random_dna_object, n=1, type='gr', mode='ORF', strand='both', longest_disjoint=TRUE)
return(max(width(orfs)))
}
Now we use the function we just created to find the longest ORF in each of 10 random genomes.
random_lengths<-unlist(lapply(1:10, get_longest_orf_in_random_genome, length=seq_length, probs=probs, bases=bases))
Let's pull out the longest length from our 10 simulations:
longest_random_orf<-max(random_lengths)
Let's keep only the ORFs in our chloroplast genome that are longer than the longest random ORF:
keep<- width(predict_orfs)> longest_random_orf
orfs_to_keep<-predict_orfs[keep]
This is what it looks like:
orfs_to_keep
## GRanges object with 9 ranges and 2 metadata columns:
## seqnames ranges strand | subject_id inframe2end
## <Rle> <IRanges> <Rle> | <integer> <numeric>
## 1 chloroplast 86762-93358 + | 1 2
## 2 chloroplast 72371-73897 + | 2 2
## 3 chloroplast 54937-56397 + | 3 3
## 4 chloroplast 57147-58541 + | 4 1
## 5 chloroplast 33918-35141 + | 5 1
## 6 chloroplast 32693-33772 + | 6 2
## 7 chloroplast 109408-110436 + | 7 3
## 8 chloroplast 114461-115447 + | 8 2
## 9 chloroplast 141539-142276 + | 9 2
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
Write these ORF sequences to a file.
extracted_orfs<- BSgenome::getSeq(dna_object, orfs_to_keep)
names(extracted_orfs)<- paste0("orf_", 1:length(orfs_to_keep))
writeXStringSet(extracted_orfs, "saved_orfs.fa")
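If you also want the predicted protein sequences, the kept ORFs can be translated directly, since each one starts at its own reading frame (a sketch; the output filename is just an example):
# translate the extracted ORFs and save the amino acid sequences
proteins <- Biostrings::translate(extracted_orfs)
writeXStringSet(proteins, "saved_orf_proteins.fa")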
First let's load the required packages:
library(karyoploteR)
## Loading required package: regioneR
library(GenomicRanges)
Now we need to set up the genome object for our karyotype:
genome_df<-data.frame(
#First we dictate the number of chromosomes
chr=paste0("chr", 1:5),
start=rep(1,5),
#and then we will dictate the length of each chromosome
end=c(34964571, 22037565, 25499034, 20862711, 31270811)
)
Now we convert the data frame to a GRanges object:
genome_gr<-makeGRangesFromDataFrame(genome_df)
Now let's create some SNP positions to map out. We do this by using the sample() function.
snp_pos<-sample(1:1e7, 25)
snps<- data.frame(
chr=paste0("chr", sample(1:5, 25, replace=TRUE)),
start=snp_pos,
end=snp_pos
)
Again we convert the data frame to GRanges:
snps_gr<-makeGRangesFromDataFrame(snps)
Now let's create some SNP labels:
snp_labels<-paste0("snp_", 1:25)
First we fetch the default plot parameters:
plot.params<-getDefaultPlotParams(plot.type=1)
Here we will set the margins of our plot:
plot.params$data1outmargin<- 600
Now let's plot our SNPs:
kp<-plotKaryotype(genome=genome_gr, plot.type=1, plot.params=plot.params)
kpPlotMarkers(kp, snps_gr, labels=snp_labels)
We can also add some numeric data to our plots. We will generate 100 random numbers mapped to 100 windows on chromosome 4.
numeric_data<-data.frame(
y=rnorm(100, mean=1, sd=0.5),
chr=rep("chr4", 100),
start=seq(1, 20862711, 20862711/100),
end=seq(1, 20862711, 20862711/100)
)
Now let's make the data a GRanges object:
numeric_data_gr<- makeGRangesFromDataFrame(numeric_data)
Again let's set our plot parameters, this time for a two-panel layout:
plot.params<-getDefaultPlotParams(plot.type=2)
plot.params$data1outmargin<-800
plot.params$data2outmargin<-800
plot.params$topmargin<-800
Let's plot the data:
kp<-plotKaryotype(genome = genome_gr, plot.type=2, plot.params=plot.params)
kpPlotMarkers(kp, snps_gr, labels=snp_labels)
kpLines(kp, numeric_data_gr, y=numeric_data$y, data.panel=2)