Анисимова Полина

Данные П. Анисимовой: измерения массы муравьёв (мг) в желудках ящериц в зависимости от месяца:

library(readxl)
## Warning: пакет 'readxl' был собран под R версии 4.2.3
# Read the month/biomass measurements from the Excel workbook
DataSet1 <- read_excel(path = "Data/Анисимова данные.xlsx")
# Per-column overview of the loaded table
summary(DataSet1)
##     month              biomass       
##  Length:24          Min.   :   0.00  
##  Class :character   1st Qu.:  16.75  
##  Mode  :character   Median :  51.00  
##                     Mean   : 201.04  
##                     3rd Qu.: 235.25  
##                     Max.   :1889.00

Чуть добавим анализа

# Location and spread of the biomass column
mean(DataSet1[["biomass"]])
## [1] 201.0417
sd(DataSet1[["biomass"]])
## [1] 399.6585
# Observed range
min(DataSet1[["biomass"]])
## [1] 0
max(DataSet1[["biomass"]])
## [1] 1889
# Default quartiles (0%, 25%, 50%, 75%, 100%)
quantile(DataSet1[["biomass"]])
##      0%     25%     50%     75%    100% 
##    0.00   16.75   51.00  235.25 1889.00
# 10th and 90th percentiles
quantile(DataSet1[["biomass"]], probs = c(0.1, 0.9))
##   10%   90% 
##   5.3 506.9

Построим гистограмму распределения

hist(DataSet1$biomass, breaks = 20)

#install.packages("gtsummary")
library(gtsummary)
## Warning: пакет 'gtsummary' был собран под R версии 4.2.3
library(tidyverse)
## Warning: пакет 'tidyverse' был собран под R версии 4.2.3
## Warning: пакет 'ggplot2' был собран под R версии 4.2.3
## Warning: пакет 'tibble' был собран под R версии 4.2.3
## Warning: пакет 'readr' был собран под R версии 4.2.3
## Warning: пакет 'purrr' был собран под R версии 4.2.3
## Warning: пакет 'dplyr' был собран под R версии 4.2.3
## Warning: пакет 'lubridate' был собран под R версии 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Show only the rows recorded in June
filter(DataSet1, month == "June")
## # A tibble: 3 × 2
##   month biomass
##   <chr>   <dbl>
## 1 June       13
## 2 June      242
## 3 June      105
# Summary table of every column in DataSet1 (gtsummary defaults)
tbl1 <- tbl_summary(DataSet1)
tbl1
Characteristic N = 241
month
    August 10 (42%)
    July 5 (21%)
    June 3 (13%)
    September 6 (25%)
biomass 51 (17, 235)
1 n (%); Median (IQR)
# Convert the summary table to a flextable and write it to a Word file
tbl1 %>%
  as_flex_table() %>%
  flextable::save_as_docx(path = "tbl1_tbl.docx")
# Same summary, split into one column per month
tbl1_2 <- tbl_summary(DataSet1, by = month)
tbl1_2
Characteristic August, N = 101 July, N = 51 June, N = 31 September, N = 61
biomass 161 (60, 508) 20 (8, 59) 105 (59, 174) 12 (5, 20)
1 Median (IQR)

Шелепин Никита

Данные по осадкам в Австралии в зависимости от места - времени.

library(readr)
DataSet2 <- read_csv("Data/Шелепин.csv")
## Rows: 145460 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (6): Location, WindGustDir, WindDir9am, WindDir3pm, RainToday, RainTom...
## dbl  (16): MinTemp, MaxTemp, Rainfall, Evaporation, Sunshine, WindGustSpeed,...
## date  (1): Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#summary(DataSet2)

Выберем из загруженных данных интересующие нас

names(DataSet2)
##  [1] "Date"          "Location"      "MinTemp"       "MaxTemp"      
##  [5] "Rainfall"      "Evaporation"   "Sunshine"      "WindGustDir"  
##  [9] "WindGustSpeed" "WindDir9am"    "WindDir3pm"    "WindSpeed9am" 
## [13] "WindSpeed3pm"  "Humidity9am"   "Humidity3pm"   "Pressure9am"  
## [17] "Pressure3pm"   "Cloud9am"      "Cloud3pm"      "Temp9am"      
## [21] "Temp3pm"       "RainToday"     "RainTomorrow"
# Keep only the location, temperature and rainfall columns
DataSet2 <- DataSet2 %>%
  select(Location, MinTemp, MaxTemp, Rainfall)
summary(DataSet2)
##    Location            MinTemp         MaxTemp         Rainfall      
##  Length:145460      Min.   :-8.50   Min.   :-4.80   Min.   :  0.000  
##  Class :character   1st Qu.: 7.60   1st Qu.:17.90   1st Qu.:  0.000  
##  Mode  :character   Median :12.00   Median :22.60   Median :  0.000  
##                     Mean   :12.19   Mean   :23.22   Mean   :  2.361  
##                     3rd Qu.:16.90   3rd Qu.:28.20   3rd Qu.:  0.800  
##                     Max.   :33.90   Max.   :48.10   Max.   :371.000  
##                     NA's   :1485    NA's   :1261    NA's   :3261

Сделаем описательную статистику для выбранных данных

# Descriptive statistics for the selected columns
tbl2 <- tbl_summary(DataSet2)
tbl2
Characteristic N = 145,4601
Location
    Adelaide 3,193 (2.2%)
    Albany 3,040 (2.1%)
    Albury 3,040 (2.1%)
    AliceSprings 3,040 (2.1%)
    BadgerysCreek 3,009 (2.1%)
    Ballarat 3,040 (2.1%)
    Bendigo 3,040 (2.1%)
    Brisbane 3,193 (2.2%)
    Cairns 3,040 (2.1%)
    Canberra 3,436 (2.4%)
    Cobar 3,009 (2.1%)
    CoffsHarbour 3,009 (2.1%)
    Dartmoor 3,009 (2.1%)
    Darwin 3,193 (2.2%)
    GoldCoast 3,040 (2.1%)
    Hobart 3,193 (2.2%)
    Katherine 1,578 (1.1%)
    Launceston 3,040 (2.1%)
    Melbourne 3,193 (2.2%)
    MelbourneAirport 3,009 (2.1%)
    Mildura 3,009 (2.1%)
    Moree 3,009 (2.1%)
    MountGambier 3,040 (2.1%)
    MountGinini 3,040 (2.1%)
    Newcastle 3,039 (2.1%)
    Nhil 1,578 (1.1%)
    NorahHead 3,004 (2.1%)
    NorfolkIsland 3,009 (2.1%)
    Nuriootpa 3,009 (2.1%)
    PearceRAAF 3,009 (2.1%)
    Penrith 3,039 (2.1%)
    Perth 3,193 (2.2%)
    PerthAirport 3,009 (2.1%)
    Portland 3,009 (2.1%)
    Richmond 3,009 (2.1%)
    Sale 3,009 (2.1%)
    SalmonGums 3,001 (2.1%)
    Sydney 3,344 (2.3%)
    SydneyAirport 3,009 (2.1%)
    Townsville 3,040 (2.1%)
    Tuggeranong 3,039 (2.1%)
    Uluru 1,578 (1.1%)
    WaggaWagga 3,009 (2.1%)
    Walpole 3,006 (2.1%)
    Watsonia 3,009 (2.1%)
    Williamtown 3,009 (2.1%)
    Witchcliffe 3,009 (2.1%)
    Wollongong 3,040 (2.1%)
    Woomera 3,009 (2.1%)
MinTemp 12 (8, 17)
    Unknown 1,485
MaxTemp 23 (18, 28)
    Unknown 1,261
Rainfall 0.0 (0.0, 0.8)
    Unknown 3,261
1 n (%); Median (IQR)

Сделаем сравнение между локациями (столбец Location)

# Descriptive statistics, one column per Location
tbl2 <- DataSet2 %>%
  tbl_summary(by = Location)
tbl2
Characteristic Adelaide, N = 3,1931 Albany, N = 3,0401 Albury, N = 3,0401 AliceSprings, N = 3,0401 BadgerysCreek, N = 3,0091 Ballarat, N = 3,0401 Bendigo, N = 3,0401 Brisbane, N = 3,1931 Cairns, N = 3,0401 Canberra, N = 3,4361 Cobar, N = 3,0091 CoffsHarbour, N = 3,0091 Dartmoor, N = 3,0091 Darwin, N = 3,1931 GoldCoast, N = 3,0401 Hobart, N = 3,1931 Katherine, N = 1,5781 Launceston, N = 3,0401 Melbourne, N = 3,1931 MelbourneAirport, N = 3,0091 Mildura, N = 3,0091 Moree, N = 3,0091 MountGambier, N = 3,0401 MountGinini, N = 3,0401 Newcastle, N = 3,0391 Nhil, N = 1,5781 NorahHead, N = 3,0041 NorfolkIsland, N = 3,0091 Nuriootpa, N = 3,0091 PearceRAAF, N = 3,0091 Penrith, N = 3,0391 Perth, N = 3,1931 PerthAirport, N = 3,0091 Portland, N = 3,0091 Richmond, N = 3,0091 Sale, N = 3,0091 SalmonGums, N = 3,0011 Sydney, N = 3,3441 SydneyAirport, N = 3,0091 Townsville, N = 3,0401 Tuggeranong, N = 3,0391 Uluru, N = 1,5781 WaggaWagga, N = 3,0091 Walpole, N = 3,0061 Watsonia, N = 3,0091 Williamtown, N = 3,0091 Witchcliffe, N = 3,0091 Wollongong, N = 3,0401 Woomera, N = 3,0091
MinTemp 12 (9, 16) 13 (10, 16) 9 (5, 14) 14 (7, 20) 11 (7, 16) 7 (4, 10) 8 (4, 13) 17 (13, 20) 22 (19, 24) 7 (2, 12) 13 (7, 19) 15 (11, 19) 9 (6, 11) 24 (22, 25) 18 (14, 21) 9 (6, 12) 22 (17, 25) 8 (4, 12) 11 (9, 15) 10 (7, 13) 10 (6, 15) 14 (7, 19) 9 (6, 11) 3 (-1, 8) 14 (10, 18) 8 (5, 12) 16 (12, 19) 17 (15, 19) 9 (6, 13) 12 (9, 16) 13 (8, 17) 13 (9, 17) 13 (9, 16) 10 (7, 12) 12 (7, 17) 8 (5, 12) 9 (5, 14) 15 (11, 19) 15 (11, 19) 22 (17, 24) 7 (2, 12) 15 (8, 21) 9 (4, 15) 12 (9, 14) 10 (7, 13) 13 (9, 17) 11 (8, 14) 15 (12, 18) 13 (8, 18)
    Unknown 2 63 11 1 36 1 2 9 1 6 6 15 69 1 3 0 49 6 480 0 0 2 2 91 346 5 30 1 11 22 28 0 0 9 20 1 41 4 1 2 1 35 0 35 7 2 9 15 4
MaxTemp 22 (17, 28) 20 (18, 22) 22 (16, 29) 30 (23, 36) 23 (19, 28) 17 (12, 23) 21 (15, 27) 27 (23, 29) 30 (28, 31) 20 (15, 26) 26 (19, 32) 24 (21, 27) 18 (15, 23) 33 (32, 34) 26 (23, 29) 17 (14, 21) 35 (33, 37) 19 (15, 23) 20 (16, 24) 19 (15, 25) 24 (18, 31) 27 (21, 33) 19 (15, 23) 12 (6, 17) 24 (20, 28) 21 (16, 28) 23 (19, 26) 22 (20, 24) 21 (15, 27) 25 (20, 31) 24 (20, 29) 24 (20, 29) 25 (20, 30) 17 (14, 20) 24 (20, 29) 19 (16, 24) 24 (19, 29) 23 (20, 26) 23 (20, 27) 30 (27, 31) 20 (15, 26) 31 (24, 37) 23 (16, 29) 20 (17, 23) 20 (15, 25) 24 (20, 27) 21 (18, 25) 21 (18, 24) 26 (20, 33)
    Unknown 3 54 11 2 29 1 5 14 0 3 3 19 63 1 7 1 40 6 481 0 0 0 5 53 235 6 30 1 9 21 25 1 0 1 14 1 40 2 0 1 4 7 0 39 0 3 6 11 3
Rainfall 0.0 (0.0, 0.8) 0.0 (0.0, 1.8) 0.0 (0.0, 0.4) 0.0 (0.0, 0.0) 0.0 (0.0, 0.4) 0.0 (0.0, 1.2) 0.0 (0.0, 0.4) 0.0 (0.0, 0.6) 0.0 (0.0, 2.6) 0.0 (0.0, 0.2) 0.0 (0.0, 0.0) 0.0 (0.0, 2.0) 0.2 (0.0, 1.8) 0.0 (0.0, 1.8) 0.0 (0.0, 1.2) 0.0 (0.0, 1.0) 0.0 (0.0, 0.0) 0.0 (0.0, 0.8) 0.0 (0.0, 1.2) 0.0 (0.0, 0.8) 0.0 (0.0, 0.0) 0.0 (0.0, 0.0) 0.2 (0.0, 1.8) 0.0 (0.0, 1.8) 0.0 (0.0, 1.0) 0.0 (0.0, 0.2) 0.0 (0.0, 1.6) 0.2 (0.0, 1.8) 0.0 (0.0, 0.5) 0.0 (0.0, 0.2) 0.0 (0.0, 0.4) 0.0 (0.0, 0.2) 0.0 (0.0, 0.2) 0.2 (0.0, 2.6) 0.0 (0.0, 0.4) 0.0 (0.0, 0.8) 0.0 (0.0, 0.2) 0.0 (0.0, 1.4) 0.0 (0.0, 1.2) 0.0 (0.0, 0.2) 0.0 (0.0, 0.2) 0.0 (0.0, 0.0) 0.0 (0.0, 0.2) 0.2 (0.0, 2.4) 0.0 (0.0, 1.0) 0.0 (0.0, 1.6) 0.0 (0.0, 2.0) 0.0 (0.0, 1.0) 0.0 (0.0, 0.0)
    Unknown 102 24 29 8 81 12 6 32 52 18 21 56 67 0 60 5 18 12 758 0 2 155 9 133 84 9 75 45 6 247 75 0 0 13 58 9 46 7 4 7 41 56 33 187 10 456 57 58 18
1 Median (IQR)

Уберем отсутствующие значения (NA)

# Drop every row with a missing value (NA) in MinTemp, MaxTemp or Rainfall
DataSet2 <- DataSet2 %>%
  drop_na(MinTemp, MaxTemp, Rainfall)
# Descriptive statistics, one column per Location
tbl2 <- DataSet2 %>%
  tbl_summary(by = Location)
tbl2
Characteristic Adelaide, N = 3,0891 Albany, N = 2,9281 Albury, N = 3,0021 AliceSprings, N = 3,0291 BadgerysCreek, N = 2,9191 Ballarat, N = 3,0271 Bendigo, N = 3,0301 Brisbane, N = 3,1451 Cairns, N = 2,9871 Canberra, N = 3,4111 Cobar, N = 2,9841 CoffsHarbour, N = 2,9441 Dartmoor, N = 2,9371 Darwin, N = 3,1911 GoldCoast, N = 2,9751 Hobart, N = 3,1871 Katherine, N = 1,4951 Launceston, N = 3,0221 Melbourne, N = 2,4341 MelbourneAirport, N = 3,0091 Mildura, N = 3,0071 Moree, N = 2,8521 MountGambier, N = 3,0271 MountGinini, N = 2,8571 Newcastle, N = 2,5651 Nhil, N = 1,5681 NorahHead, N = 2,9171 NorfolkIsland, N = 2,9621 Nuriootpa, N = 2,9841 PearceRAAF, N = 2,7311 Penrith, N = 2,9531 Perth, N = 3,1921 PerthAirport, N = 3,0091 Portland, N = 2,9861 Richmond, N = 2,9391 Sale, N = 2,9981 SalmonGums, N = 2,9431 Sydney, N = 3,3331 SydneyAirport, N = 3,0051 Townsville, N = 3,0311 Tuggeranong, N = 2,9941 Uluru, N = 1,5171 WaggaWagga, N = 2,9761 Walpole, N = 2,7961 Watsonia, N = 2,9941 Williamtown, N = 2,5481 Witchcliffe, N = 2,9431 Wollongong, N = 2,9741 Woomera, N = 2,9841
MinTemp 12 (9, 16) 13 (10, 16) 9 (5, 14) 14 (7, 20) 11 (7, 16) 7 (4, 10) 8 (4, 13) 17 (13, 20) 22 (19, 24) 7 (2, 12) 13 (7, 19) 15 (11, 18) 9 (6, 11) 24 (22, 25) 18 (14, 21) 9 (6, 12) 22 (17, 25) 8 (4, 12) 12 (9, 15) 10 (7, 13) 10 (6, 15) 13 (7, 19) 9 (6, 11) 3 (-1, 8) 14 (10, 18) 8 (5, 12) 16 (12, 19) 17 (15, 19) 9 (6, 13) 12 (9, 16) 13 (8, 17) 13 (9, 17) 13 (9, 16) 10 (7, 12) 12 (7, 17) 8 (5, 12) 9 (5, 14) 15 (11, 19) 15 (11, 19) 22 (17, 24) 7 (2, 12) 15 (8, 21) 9 (4, 15) 12 (10, 14) 10 (7, 13) 13 (9, 17) 11 (8, 14) 15 (12, 18) 13 (8, 18)
MaxTemp 22 (17, 28) 20 (18, 22) 22 (16, 29) 30 (23, 36) 23 (19, 28) 17 (12, 23) 21 (15, 27) 27 (23, 29) 30 (28, 31) 20 (15, 26) 26 (19, 32) 24 (21, 27) 18 (15, 23) 33 (32, 34) 26 (23, 28) 17 (14, 21) 35 (33, 37) 19 (15, 23) 20 (16, 24) 19 (15, 25) 24 (18, 31) 27 (21, 33) 19 (15, 23) 12 (6, 17) 24 (20, 28) 21 (16, 28) 23 (19, 26) 22 (20, 24) 21 (15, 27) 26 (21, 32) 24 (20, 29) 24 (20, 29) 25 (20, 30) 17 (14, 20) 24 (20, 29) 19 (16, 24) 24 (19, 29) 23 (20, 26) 23 (20, 27) 30 (27, 31) 20 (15, 26) 31 (24, 37) 22 (16, 29) 20 (17, 23) 20 (15, 25) 24 (20, 28) 21 (18, 25) 21 (18, 24) 26 (20, 33)
Rainfall 0.0 (0.0, 0.8) 0.0 (0.0, 1.8) 0.0 (0.0, 0.4) 0.0 (0.0, 0.0) 0.0 (0.0, 0.4) 0.0 (0.0, 1.2) 0.0 (0.0, 0.4) 0.0 (0.0, 0.6) 0.0 (0.0, 2.6) 0.0 (0.0, 0.2) 0.0 (0.0, 0.0) 0.0 (0.0, 2.0) 0.2 (0.0, 1.8) 0.0 (0.0, 1.8) 0.0 (0.0, 1.2) 0.0 (0.0, 1.0) 0.0 (0.0, 0.0) 0.0 (0.0, 0.8) 0.0 (0.0, 1.2) 0.0 (0.0, 0.8) 0.0 (0.0, 0.0) 0.0 (0.0, 0.0) 0.2 (0.0, 1.8) 0.0 (0.0, 1.8) 0.0 (0.0, 1.0) 0.0 (0.0, 0.2) 0.0 (0.0, 1.6) 0.2 (0.0, 1.8) 0.0 (0.0, 0.4) 0.0 (0.0, 0.2) 0.0 (0.0, 0.4) 0.0 (0.0, 0.2) 0.0 (0.0, 0.2) 0.2 (0.0, 2.6) 0.0 (0.0, 0.4) 0.0 (0.0, 0.8) 0.0 (0.0, 0.2) 0.0 (0.0, 1.4) 0.0 (0.0, 1.2) 0.0 (0.0, 0.2) 0.0 (0.0, 0.2) 0.0 (0.0, 0.0) 0.0 (0.0, 0.2) 0.2 (0.0, 2.4) 0.0 (0.0, 1.0) 0.0 (0.0, 1.6) 0.0 (0.0, 2.0) 0.0 (0.0, 1.0) 0.0 (0.0, 0.0)
1 Median (IQR)

Добавим статистику вывода (предварительно оставим только 2 локации)

# Keep observations from two locations only: Adelaide and Albury
DataSet2 <- DataSet2 %>%
  filter(Location %in% c("Adelaide", "Albury"))

# Descriptive statistics by Location, with a p-value column appended
tbl2 <- DataSet2 %>%
  tbl_summary(by = Location) %>%
  add_p()
tbl2
Characteristic Adelaide, N = 3,0891 Albury, N = 3,0021 p-value2
MinTemp 12.1 (9.1, 15.5) 9.1 (4.7, 14.3) <0.001
MaxTemp 22 (17, 28) 22 (16, 29) 0.020
Rainfall 0.0 (0.0, 0.8) 0.0 (0.0, 0.4) 0.4
1 Median (IQR)
2 Wilcoxon rank sum test

Проблемы

Какие могут возникнуть проблемы с данными

library(readxl)
DataSet3 <- read_excel("Data/Баёва данные.xlsx")
# View() opens a data viewer window, so call it only in an interactive
# session; a batch run or knit skips it.
if (interactive()) {
  View(DataSet3)
}

# Note the "awkward" column name: it is not a syntactic R name,
# so it has to be wrapped in backticks.
DataSet3$`32 электрода`
## [1] 44.29 40.00 39.05 34.29 40.48 51.43

Ещё проблемы — двойное наименование столбцов

library(readxl)
# Read the whole sheet as-is
DataSet4 <- read_excel("Data/Горбунова данные.xlsx")
## New names:
## • `` -> `...3`
## • `` -> `...5`
## • `` -> `...7`
## • `` -> `...9`
# View() opens a data viewer window, so call it only in an interactive
# session; a batch run or knit skips it.
if (interactive()) {
  View(DataSet4)
}
summary(DataSet4)
##     Группа           Интактные             ...3               ЧМТ           
##  Length:8           Length:8           Length:8           Length:8          
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##      ...5                Sc                ...7               Sc+T          
##  Length:8           Length:8           Length:8           Length:8          
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##      ...9          
##  Length:8          
##  Class :character  
##  Mode  :character

Ещё проблемы — выбор части данных

library(readxl)
# Read only the A2:C8 cell range of the workbook
DataSet4 <- read_excel(
  path = "Data/Горбунова данные.xlsx",
  range = "A2:C8"
)
summary(DataSet4)
##        №           Обучение        Проверка  
##  Min.   :1.00   Min.   : 5.00   Min.   :180  
##  1st Qu.:2.25   1st Qu.:11.25   1st Qu.:180  
##  Median :3.50   Median :15.50   Median :180  
##  Mean   :3.50   Mean   :19.00   Mean   :180  
##  3rd Qu.:4.75   3rd Qu.:19.00   3rd Qu.:180  
##  Max.   :6.00   Max.   :48.00   Max.   :180