###For chapter 06
phyto_metadata <- readRDS("phyto_metadata.obj")
species_ryuko_data <- readRDS("phyto_ryuko_data.obj")
metadata_ecoplate <- readRDS("metadata_ecopl.obj")
summary_ecoplate <- readRDS("summary_ecopl.obj")
#from basic_graphics.R 種数について再度計算
species_richness <- apply(species_ryuko_data > 0, 1, sum)
#subset関数を使うと条件を満たす要素を抜き出せる
subset(species_richness, phyto_metadata$year == "2018")
20180316_1_ryuko.csv 20180316_2_ryuko.csv 20181004_1_ryuko.csv
16 17 43
20181004_2_ryuko.csv 20181004_3_ryuko.csv 20181025_1_ryuko.csv
32 36 23
20181025_2_ryuko.csv
32
subset(species_ryuko_data, phyto_metadata$year == "2018")
#subset関数で抜き出した要素を、左辺のオブジェクトに格納(代入)する。
species_richness2018 <- subset(species_richness, phyto_metadata$year == "2018")
species_richness2019 <- subset(species_richness, phyto_metadata$year == "2019")
#抜き出したデータの要約統計量
summary(species_richness2018)
Min. 1st Qu. Median Mean 3rd Qu. Max.
16.00 20.00 32.00 28.43 34.00 43.00
summary(species_richness2019)
Min. 1st Qu. Median Mean 3rd Qu. Max.
19.00 24.25 25.50 26.50 27.50 37.00
#要約統計量
tapply(species_richness, phyto_metadata$year, summary)
$`2018`
Min. 1st Qu. Median Mean 3rd Qu. Max.
16.00 20.00 32.00 28.43 34.00 43.00
$`2019`
Min. 1st Qu. Median Mean 3rd Qu. Max.
19.00 24.25 25.50 26.50 27.50 37.00
#平均
tapply(species_richness, phyto_metadata$year, mean)
2018 2019
28.42857 26.50000
#分散
tapply(species_richness, phyto_metadata$year, var)
2018 2019
101.61905 31.71429
#Welch(ウェルチ)の2標本t検定を使うのが最近のデフォルト
#公式(Y~X)の形で書く
t.test(species_richness ~ phyto_metadata$year, var.equal = F)
Welch Two Sample t-test
data: species_richness by phyto_metadata$year
t = 0.44861, df = 9.1401, p-value = 0.6642
alternative hypothesis: true difference in means between group 2018 and group 2019 is not equal to 0
95 percent confidence interval:
-7.77373 11.63087
sample estimates:
mean in group 2018 mean in group 2019
28.42857 26.50000
種組成のデータ=各種の個体数のデータを各行(1)に対して、全種について足し合わせて(sum),全個体数を計算する
total_abundance <- apply(species_ryuko_data,1,sum)
boxplot( #箱ひげ図
total_abundance ~ phyto_metadata$year, #公式スタイル
outline = TRUE, #外れ値の表示
col = "white" #箱の色指定
)
stripchart( #一次元散布図
total_abundance ~ phyto_metadata$year, #公式スタイル
method = "stack", #重複する点は規則的に並べる
pch = c(1,2), #点のマーク(記号)の指定
cex = 3, #マークのサイズ
vertical = TRUE, #垂直(vertical)方向に散布図を延ばす
add = TRUE #上の箱ひげ図に重ねて描画
)
#年ごとに総個体数(total_abundance)の平均(mean)を計算
tapply(total_abundance, phyto_metadata$year, mean)
2018 2019
462.1429 205.5000
#Welchのt検定
t.test(total_abundance ~ phyto_metadata$year, var.equal = F)
Welch Two Sample t-test
data: total_abundance by phyto_metadata$year
t = 2.392, df = 6.7469, p-value = 0.04933
alternative hypothesis: true difference in means between group 2018 and group 2019 is not equal to 0
95 percent confidence interval:
0.9986613 512.2870530
sample estimates:
mean in group 2018 mean in group 2019
462.1429 205.5000
#箱ひげ図と一次元散布図の重ね合わせ
#因子(要因)phyto_matadata$monthは、as.factor関数を使って明示的に「因子」にする
boxplot(species_richness ~ as.factor(phyto_metadata$month),
outline = FALSE, #外れ値は明示しない
col = "white",
xlab = "month",
ylab = "species richness",
ylim = c(0,40) #Y軸の表示範囲(最小値,最大値)を指定
)
stripchart(
species_richness ~ as.factor(phyto_metadata$month),
method = "stack",
cex = 2,
vertical = TRUE,
add = TRUE
)
#anova(lm(応答変数 ~ 説明変数(説明因子))の形式で書く
anova(lm(species_richness ~ as.factor(phyto_metadata$month)))
Analysis of Variance Table
Response: species_richness
Df Sum Sq Mean Sq F value Pr(>F)
as.factor(phyto_metadata$month) 3 418.83 139.611 3.5985 0.04961 *
Residuals 11 426.77 38.797
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#組み込みデータInsectSpraysの読み込み
data("InsectSprays")
#データの中身を表示
View(InsectSprays)
boxplot(count ~ spray, data = InsectSprays, #いままでと違う書き方
outline = FALSE,
col = "white",
xlab = "spray type",
ylab = "insect count"
)
stripchart(
count ~ as.factor(spray), data = InsectSprays, #いままでと違う書き方
method = "stack",
cex = 2,
vertical = TRUE,
add = TRUE
)
anova(lm(count ~ spray, data = InsectSprays))
Analysis of Variance Table
Response: count
Df Sum Sq Mean Sq F value Pr(>F)
spray 5 2668.8 533.77 34.702 < 2.2e-16 ***
Residuals 66 1015.2 15.38
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
plot(
species_richness ~ phyto_metadata$temp,
type="p",
cex = 3,
xlab = "temperature",
ylab = "species richness"
)
#植物プランクトンの種数(species_richness)と水温(temp)の相関
cor(species_richness, phyto_metadata$temp)
[1] 0.701338
#微生物の異なる炭素基質活性間の相関
#3列以上のデータフレームを引数にすると総当たりのペアでの相関が計算される
cor(summary_ecoplate[,1:5])
s01 s02 s03 s04 s05
s01 1.0000000 -0.04709880 0.1032132 -0.48970438 -0.2143492
s02 -0.0470988 1.00000000 0.3721024 -0.05531352 0.3005611
s03 0.1032132 0.37210238 1.0000000 0.14759165 0.2211696
s04 -0.4897044 -0.05531352 0.1475916 1.00000000 0.4894126
s05 -0.2143492 0.30056114 0.2211696 0.48941263 1.0000000
#スペルマン(spearman)の順位相関係数
cor(species_richness, phyto_metadata$temp, method = "spearman")
[1] 0.6957425
cor.test(species_richness, phyto_metadata$temp)
Pearson's product-moment correlation
data: species_richness and phyto_metadata$temp
t = 3.5474, df = 13, p-value = 0.003574
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.2950930 0.8928332
sample estimates:
cor
0.701338
#Y~Xという公式で「YをXで回帰する」という設定になる
#lm関数の出力をmodel01というオブジェクトに格納(代入 <- )
model01 <- lm(species_richness ~ phyto_metadata$temp)
#lm関数の出力を格納したmodel01の結果のまとめ
summary(model01)
Call:
lm(formula = species_richness ~ phyto_metadata$temp)
Residuals:
Min 1Q Median 3Q Max
-8.1291 -4.5127 -0.4173 1.8918 13.3918
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.9212 5.9606 1.161 0.26646
phyto_metadata$temp 1.1751 0.3313 3.547 0.00357 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 5.749 on 13 degrees of freedom
Multiple R-squared: 0.4919, Adjusted R-squared: 0.4528
F-statistic: 12.58 on 1 and 13 DF, p-value: 0.003574
plot( #plot関数の引数を従来通り設定して散布図を描く
species_richness ~ phyto_metadata$temp,
type = "p",
cex = 3,
xlab = "temperature",
ylab = "species richness"
)
#回帰直線をabline(モデルの結果を格納したオブジェクト, 線の色)の形で重ねて描画する
abline(model01,col = 4)
#回帰係数は元データの値を2倍や10倍にしても変わらない
cor(phyto_metadata$temp, species_richness)
[1] 0.701338
#片方を2倍にしても・・・
cor(phyto_metadata$temp*2, species_richness)
[1] 0.701338
#他方を10倍にしても・・・
cor(phyto_metadata$temp, species_richness*10)
[1] 0.701338
temp2 <- 2*phyto_metadata$temp #水温データを2倍にする
species_richness10 <- 10*species_richness #種数データを10倍にする
#元の回帰
lm(species_richness ~ phyto_metadata$temp)
Call:
lm(formula = species_richness ~ phyto_metadata$temp)
Coefficients:
(Intercept) phyto_metadata$temp
6.921 1.175
#水温を2倍にした場合
lm(species_richness ~ temp2)
Call:
lm(formula = species_richness ~ temp2)
Coefficients:
(Intercept) temp2
6.9212 0.5876
#種数を10倍にした場合
lm(species_richness10 ~ phyto_metadata$temp)
Call:
lm(formula = species_richness10 ~ phyto_metadata$temp)
Coefficients:
(Intercept) phyto_metadata$temp
69.21 11.75
#相関の場合XとYを入れ替えても相関係数は変わらない
cor(phyto_metadata$temp, species_richness)
[1] 0.701338
cor(species_richness, phyto_metadata$temp)
[1] 0.701338
#回帰についてXとYを入れ替えてみた場合
model02 <- lm(phyto_metadata$temp ~ species_richness)
#回帰モデルの結果のうち、推定された係数(coefficients)のみを抜き出す
model02$coefficients
(Intercept) species_richness
5.9579391 0.4185667
#tempによるspecies_richnessの回帰直線とspecies_richnessによるtempの回帰直線を重ねて描画してみる
# phyto_metadata$temp = 5.9579 + 0.4186*species_richness
#この式を変形して・・・
# species_richness = -0.9579/0.4186 + (1.0/0.4186)*phyto_metadata$temp
plot(
species_richness ~ phyto_metadata$temp,
type="p",
cex = 3,
xlab = "temperature",
ylab = "species richness"
)
#tempによるspecies_richnessの回帰直線(水色実践)
abline(model01,col = 4)
#species_richnessによるtempの回帰直線(黒色破線)
abline(c(-model02$coefficients[1]/model02$coefficients[2], 1.0/model02$coefficients[2]),lty=2)
data(anscombe) #Rに組み込まれているデータの読み込み
#4つの異なる組合せで回帰分析
model_a01 <- lm(y1~x1, data = anscombe)
model_a02 <- lm(y2~x2, data = anscombe)
model_a03 <- lm(y3~x3, data = anscombe)
model_a04 <- lm(y4~x4, data = anscombe)
#推定された回帰係数等の結果が同じになることを確認してみよう
summary(model_a01)
Call:
lm(formula = y1 ~ x1, data = anscombe)
Residuals:
Min 1Q Median 3Q Max
-1.92127 -0.45577 -0.04136 0.70941 1.83882
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.0001 1.1247 2.667 0.02573 *
x1 0.5001 0.1179 4.241 0.00217 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.237 on 9 degrees of freedom
Multiple R-squared: 0.6665, Adjusted R-squared: 0.6295
F-statistic: 17.99 on 1 and 9 DF, p-value: 0.00217
summary(model_a02)
Call:
lm(formula = y2 ~ x2, data = anscombe)
Residuals:
Min 1Q Median 3Q Max
-1.9009 -0.7609 0.1291 0.9491 1.2691
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.001 1.125 2.667 0.02576 *
x2 0.500 0.118 4.239 0.00218 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.237 on 9 degrees of freedom
Multiple R-squared: 0.6662, Adjusted R-squared: 0.6292
F-statistic: 17.97 on 1 and 9 DF, p-value: 0.002179
summary(model_a03)
Call:
lm(formula = y3 ~ x3, data = anscombe)
Residuals:
Min 1Q Median 3Q Max
-1.1586 -0.6146 -0.2303 0.1540 3.2411
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.0025 1.1245 2.670 0.02562 *
x3 0.4997 0.1179 4.239 0.00218 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.236 on 9 degrees of freedom
Multiple R-squared: 0.6663, Adjusted R-squared: 0.6292
F-statistic: 17.97 on 1 and 9 DF, p-value: 0.002176
summary(model_a04)
Call:
lm(formula = y4 ~ x4, data = anscombe)
Residuals:
Min 1Q Median 3Q Max
-1.751 -0.831 0.000 0.809 1.839
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.0017 1.1239 2.671 0.02559 *
x4 0.4999 0.1178 4.243 0.00216 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.236 on 9 degrees of freedom
Multiple R-squared: 0.6667, Adjusted R-squared: 0.6297
F-statistic: 18 on 1 and 9 DF, p-value: 0.002165
#可視化したら全然違うパターンになっている
plot(
y1~x1, data = anscombe,
type = "p",
cex = 3,
xlim = c(0, 20),
ylim = c(0, 15),
xlab = "",
ylab = ""
)
abline(model_a01)
plot(
y2~x2, data = anscombe,
type = "p",
cex = 3,
xlim = c(0, 20),
ylim = c(0, 15),
xlab = "",
ylab = ""
)
abline(model_a02)
plot(
y3~x3, data = anscombe,
type = "p",
cex = 3,
xlim = c(0, 20),
ylim = c(0, 15),
xlab = "",
ylab = ""
)
abline(model_a03)
plot(
y4~x4, data = anscombe,
type = "p",
cex = 3,
xlim = c(0, 20),
ylim = c(0, 15),
xlab = "",
ylab = ""
)
abline(model_a04)
#CSVファイルからデータ読み込み
data_s <- read.csv("DatasaurusDozen.csv", header=T)
#異なる二つのデータペアを抜き出す
data_s01 <- subset(data_s, dataset == "circle")[2:3]
data_s02 <- subset(data_s, dataset == "dino")[2:3]
#相関係数に違いがないことを確かめよう
cor(data_s01)
x y
x 1.00000000 -0.06834336
y -0.06834336 1.00000000
cor(data_s02)
x y
x 1.00000000 -0.06447185
y -0.06447185 1.00000000
plot(data_s01)
plot(data_s02)