ONLY DO WHAT ONLY YOU CAN DO

こけたら立ちなはれ 立ったら歩きなはれ

「2014 FIFA ワールドカップ Brasil」を主成分分析

「Copa do Mundo de Futebol FIFA Brasil 2014」を主成分分析してみた。

データは、ここから入手。
https://www.whoscored.com/Statistics
こんな風な tab 区切りファイルに保存して

Team	Rank	Rating	Shotsconceded 	Shots	Tackles	CaughtOffside	Blocks	Interception	Clearances	Save	Goals	Dribbles	PossessionLoss	AerialWon	AerialLost	Passes	KeyPpasses	Assists	Fouls	Fouled
Algeria	10	6.92	15.5	9	20.8	1.5	10.8	16.3	29.5	4.8	1.5	6.5	19.8	14.8	22	326	6.5	1.3	17.3	13.5
Argentina	4	7.12	11.1	15.4	19	1.4	12.5	14.1	26.7	2.4	0.9	12.3	24	10.4	9.7	461.3	10.4	0.3	11	16.6
Australia	30	6.39	11.3	9	12.3	1	12	15.3	21	2.7	1	10.3	24.4	12.7	10.7	401.7	7.3	0.7	16.7	13
...省略...
Switzerland	1	7.14	18.3	16.3	20	1.3	15.8	12	24.5	4.3	1.8	6.5	23.6	11	10.3	392	13	1.5	16.5	15.8
Uruguay	26	6.63	10.8	11.8	18.5	2.8	12.3	15.8	24.3	2.3	1	5.5	23.3	18.3	17.5	346	7.8	0.5	18.3	15.8
USA	11	6.91	23.5	11	20	1	17.1	13.5	38.5	5.8	1	9	17.5	15	16.3	385.3	5.8	0.8	12.3	14

R に読み込み

d <- read.table("WorldCup2014TeamStatistics.txt", header=T)

rownames(d) <- c("Algeria",
"Argentina",
"Australia",
"Belgium",
"Bosnia_and_Herzegovina",
"Brazil",
"Cameroon",
"Chile",
"Colombia",
"Costa_Rica",
"Croatia",
"Ecuador",
"England",
"France",
"Germany",
"Ghana",
"Greece",
"Honduras",
"Iran",
"Italy",
"Ivory_Coast",
"Japan",
"Mexico",
"Netherlands",
"Nigeria",
"Portugal",
"Russia",
"South_Korea",
"Spain",
"Switzerland",
"Uruguay",
"USA")

主成分分析

p1 <- prcomp(d[,4:21], 
    scale=TRUE
)
biplot(
    p1
)

f:id:fornext1119:20180626124514p:plain

ggplot2 で 散布図を描画

p1_loading <- data.frame(t(cor(p1$x,d[,4:21])))
p1_scores <- data.frame(p1$x)

# フォントを準備
windowsFonts(HGKAI=windowsFont("HG正楷書体-PRO"))
windowsFonts(COURIER=windowsFont("Courier New"))

# ggplot2 パッケージを使用
library("ggplot2")
# ggrepel パッケージを使用
library("ggrepel")

g <- ggplot()
g <- g + geom_point(
    data=p1_scores,
    aes(
        x=p1_scores$PC1, 
        y=p1_scores$PC2, 
        size=(p1_scores$PC3), 
        alpha=(p1_scores$PC4)
    ),
    colour="blue"
)
# 国名を表示
g <- g + geom_text_repel(
    data=p1_scores,
    aes(
        x=p1_scores$PC1, 
        y=p1_scores$PC2, 
        label=rownames(p1_scores),
        family="COURIER"
    ),
    size=3
)
# 主成分負荷量を表示
g <- g + geom_text_repel(
    data=p1_loading,
    aes(
        x=(p1_loading$PC1 * 4),
        y=(p1_loading$PC2 * 4), 
        label=rownames(p1_loading) ,
        family="COURIER"
    ),
    size=3,
    colour="green4",
    alpha=0.8
)
#  主成分負荷量の矢印を描画
g <- g + geom_segment(
    data=p1_loading,
    aes(
        x=0,
        y=0,
        xend=(p1_loading$PC1 * 4),
        yend=(p1_loading$PC2 * 4)
    ),
    colour="green4",
    arrow=arrow(
        angle = 20, 
        length = unit(0.15, "inches"),
        type = "open"
    ),
    alpha=0.3,
    size=1
)
# タイトルを変更
g <- g + labs(title="主成分分析")
g <- g + xlab("第1主成分")
g <- g + ylab("第2主成分")
# フォントを指定
g <- g + theme_bw(
    base_size=15, 
    base_family="HGKAI"
)
g <- g + theme(
    legend.position="none",
    plot.title=element_text(
        hjust=0.5
    ),
    axis.title=element_text(
        size=12
    ),
    axis.text.x=element_text(
        family="COURIER", 
        size=10
    ),
    axis.text.y=element_text(
        family="COURIER", 
        size=10
    )
)
print(g)

f:id:fornext1119:20180626124805p:plain