ONLY DO WHAT ONLY YOU CAN DO

こけたら立ちなはれ 立ったら歩きなはれ

Rでクラスター分析~少年サッカー データ分析~

Rでクラスター分析

距離の計算方法や分類(クラスターの結合)方法が色々あり、アルゴリズムの説明を読むだけでは、どれを選択すればよいのかピンと来ない。
ちなみに、ここの説明が詳しい
www.albert2005.co.jp

まずは、色々ためしてみる。

# The dist() calls in this script use method names such as "Mahalanobis",
# "Chebyshev", "eJaccard", "cosine" and "correlation" (and a capitalised
# "Euclidean"). stats::dist() rejects all of these with "invalid distance
# method"; they are entries of the proxy package's distance registry.
# Attaching proxy makes dist() resolve to proxy::dist so the script runs in a
# fresh session.
library(proxy)

# Read the statistics table from the data directory.
# NOTE(review): setwd() changes the working directory for the rest of the
# session and the path is machine-specific -- adjust as needed.
setwd("C:/data")
d <- read.table("stats.txt", header = TRUE)

# Lay plots out on a 2 x 2 grid. par() returns the previous settings, which
# are kept in par.save so they can be restored once plotting is finished.
par.save <- par(mfrow = c(2, 2))

ウォード法 x さまざまな距離

# Ward linkage ("ward.D") on eight different distance measures. Each
# dendrogram is drawn and then cut into k = 3 clusters with rect.hclust().
# NOTE(review): several method names below are not accepted by stats::dist();
# this presumably relies on proxy::dist being attached -- confirm.
# NOTE(review): per the hclust documentation, "ward.D" only implements Ward's
# criterion when given squared dissimilarities ("ward.D2" squares them
# itself) -- confirm which variant is intended.

# Euclidean distance
d_dist_euclidean <- dist(d, method = "Euclidean")
d_clust_ward <- hclust(d_dist_euclidean, method = "ward.D")
plot(x = d_clust_ward)
rect.hclust(tree = d_clust_ward, k = 3)

# Mahalanobis distance
d_dist_mahalanobis <- dist(d, method = "Mahalanobis")
d_clust_ward <- hclust(d_dist_mahalanobis, method = "ward.D")
plot(x = d_clust_ward)
rect.hclust(tree = d_clust_ward, k = 3)

# Manhattan distance
d_dist_manhattan <- dist(d, method = "Manhattan")
d_clust_ward <- hclust(d_dist_manhattan, method = "ward.D")
plot(x = d_clust_ward)
rect.hclust(tree = d_clust_ward, k = 3)

# Chebyshev distance
d_dist_chebyshev <- dist(d, method = "Chebyshev")
d_clust_ward <- hclust(d_dist_chebyshev, method = "ward.D")
plot(x = d_clust_ward)
rect.hclust(tree = d_clust_ward, k = 3)

# Canberra distance
d_dist_canberra <- dist(d, method = "Canberra")
d_clust_ward <- hclust(d_dist_canberra, method = "ward.D")
plot(x = d_clust_ward)
rect.hclust(tree = d_clust_ward, k = 3)

# Distance derived from the extended Jaccard similarity
d_dist_ejaccard <- dist(d, method = "eJaccard")
d_clust_ward <- hclust(d_dist_ejaccard, method = "ward.D")
plot(x = d_clust_ward)
rect.hclust(tree = d_clust_ward, k = 3)

# Distance derived from the cosine similarity
d_dist_cosine <- dist(d, method = "cosine")
d_clust_ward <- hclust(d_dist_cosine, method = "ward.D")
plot(x = d_clust_ward)
rect.hclust(tree = d_clust_ward, k = 3)

# Distance derived from Pearson's product-moment correlation coefficient
d_dist_correlation <- dist(d, method = "correlation")
d_clust_ward <- hclust(d_dist_correlation, method = "ward.D")
plot(x = d_clust_ward)
rect.hclust(tree = d_clust_ward, k = 3)

f:id:fornext1119:20180509225951p:plain
f:id:fornext1119:20180509230233p:plain

最近隣法(最短距離法) x さまざまな距離

# Single linkage (nearest neighbour) on eight different distance measures.
# Each dendrogram is drawn and then cut into k = 3 clusters with
# rect.hclust().
# NOTE(review): several method names below are not accepted by stats::dist();
# this presumably relies on proxy::dist being attached -- confirm.

# Euclidean distance
d_dist_euclidean <- dist(d, method = "Euclidean")
d_clust_single <- hclust(d_dist_euclidean, method = "single")
plot(x = d_clust_single)
rect.hclust(tree = d_clust_single, k = 3)

# Mahalanobis distance
d_dist_mahalanobis <- dist(d, method = "Mahalanobis")
d_clust_single <- hclust(d_dist_mahalanobis, method = "single")
plot(x = d_clust_single)
rect.hclust(tree = d_clust_single, k = 3)

# Manhattan distance
d_dist_manhattan <- dist(d, method = "Manhattan")
d_clust_single <- hclust(d_dist_manhattan, method = "single")
plot(x = d_clust_single)
rect.hclust(tree = d_clust_single, k = 3)

# Chebyshev distance
d_dist_chebyshev <- dist(d, method = "Chebyshev")
d_clust_single <- hclust(d_dist_chebyshev, method = "single")
plot(x = d_clust_single)
rect.hclust(tree = d_clust_single, k = 3)

# Canberra distance
d_dist_canberra <- dist(d, method = "Canberra")
d_clust_single <- hclust(d_dist_canberra, method = "single")
plot(x = d_clust_single)
rect.hclust(tree = d_clust_single, k = 3)

# Distance derived from the extended Jaccard similarity
d_dist_ejaccard <- dist(d, method = "eJaccard")
d_clust_single <- hclust(d_dist_ejaccard, method = "single")
plot(x = d_clust_single)
rect.hclust(tree = d_clust_single, k = 3)

# Distance derived from the cosine similarity
d_dist_cosine <- dist(d, method = "cosine")
d_clust_single <- hclust(d_dist_cosine, method = "single")
plot(x = d_clust_single)
rect.hclust(tree = d_clust_single, k = 3)

# Distance derived from Pearson's product-moment correlation coefficient
d_dist_correlation <- dist(d, method = "correlation")
d_clust_single <- hclust(d_dist_correlation, method = "single")
plot(x = d_clust_single)
rect.hclust(tree = d_clust_single, k = 3)

f:id:fornext1119:20180509231725p:plain
f:id:fornext1119:20180509231734p:plain

最遠隣法(最長距離法) x さまざまな距離

# Complete linkage (furthest neighbour) on eight different distance measures.
# Each dendrogram is drawn and then cut into k = 3 clusters with
# rect.hclust().
# NOTE(review): several method names below are not accepted by stats::dist();
# this presumably relies on proxy::dist being attached -- confirm.

# Euclidean distance
d_dist_euclidean <- dist(d, method = "Euclidean")
d_clust_complete <- hclust(d_dist_euclidean, method = "complete")
plot(x = d_clust_complete)
rect.hclust(tree = d_clust_complete, k = 3)

# Mahalanobis distance
d_dist_mahalanobis <- dist(d, method = "Mahalanobis")
d_clust_complete <- hclust(d_dist_mahalanobis, method = "complete")
plot(x = d_clust_complete)
rect.hclust(tree = d_clust_complete, k = 3)

# Manhattan distance
d_dist_manhattan <- dist(d, method = "Manhattan")
d_clust_complete <- hclust(d_dist_manhattan, method = "complete")
plot(x = d_clust_complete)
rect.hclust(tree = d_clust_complete, k = 3)

# Chebyshev distance
d_dist_chebyshev <- dist(d, method = "Chebyshev")
d_clust_complete <- hclust(d_dist_chebyshev, method = "complete")
plot(x = d_clust_complete)
rect.hclust(tree = d_clust_complete, k = 3)

# Canberra distance
d_dist_canberra <- dist(d, method = "Canberra")
d_clust_complete <- hclust(d_dist_canberra, method = "complete")
plot(x = d_clust_complete)
rect.hclust(tree = d_clust_complete, k = 3)

# Distance derived from the extended Jaccard similarity
d_dist_ejaccard <- dist(d, method = "eJaccard")
d_clust_complete <- hclust(d_dist_ejaccard, method = "complete")
plot(x = d_clust_complete)
rect.hclust(tree = d_clust_complete, k = 3)

# Distance derived from the cosine similarity
d_dist_cosine <- dist(d, method = "cosine")
d_clust_complete <- hclust(d_dist_cosine, method = "complete")
plot(x = d_clust_complete)
rect.hclust(tree = d_clust_complete, k = 3)

# Distance derived from Pearson's product-moment correlation coefficient
d_dist_correlation <- dist(d, method = "correlation")
d_clust_complete <- hclust(d_dist_correlation, method = "complete")
plot(x = d_clust_complete)
rect.hclust(tree = d_clust_complete, k = 3)

f:id:fornext1119:20180509231758p:plain
f:id:fornext1119:20180509231813p:plain

群平均法 x さまざまな距離

# Group-average linkage on eight different distance measures. Each dendrogram
# is drawn and then cut into k = 3 clusters with rect.hclust().
# NOTE(review): several method names below are not accepted by stats::dist();
# this presumably relies on proxy::dist being attached -- confirm.

# Euclidean distance
d_dist_euclidean <- dist(d, method = "Euclidean")
d_clust_average <- hclust(d_dist_euclidean, method = "average")
plot(x = d_clust_average)
rect.hclust(tree = d_clust_average, k = 3)

# Mahalanobis distance
d_dist_mahalanobis <- dist(d, method = "Mahalanobis")
d_clust_average <- hclust(d_dist_mahalanobis, method = "average")
plot(x = d_clust_average)
rect.hclust(tree = d_clust_average, k = 3)

# Manhattan distance
d_dist_manhattan <- dist(d, method = "Manhattan")
d_clust_average <- hclust(d_dist_manhattan, method = "average")
plot(x = d_clust_average)
rect.hclust(tree = d_clust_average, k = 3)

# Chebyshev distance
d_dist_chebyshev <- dist(d, method = "Chebyshev")
d_clust_average <- hclust(d_dist_chebyshev, method = "average")
plot(x = d_clust_average)
rect.hclust(tree = d_clust_average, k = 3)

# Canberra distance
d_dist_canberra <- dist(d, method = "Canberra")
d_clust_average <- hclust(d_dist_canberra, method = "average")
plot(x = d_clust_average)
rect.hclust(tree = d_clust_average, k = 3)

# Distance derived from the extended Jaccard similarity
d_dist_ejaccard <- dist(d, method = "eJaccard")
d_clust_average <- hclust(d_dist_ejaccard, method = "average")
plot(x = d_clust_average)
rect.hclust(tree = d_clust_average, k = 3)

# Distance derived from the cosine similarity
d_dist_cosine <- dist(d, method = "cosine")
d_clust_average <- hclust(d_dist_cosine, method = "average")
plot(x = d_clust_average)
rect.hclust(tree = d_clust_average, k = 3)

# Distance derived from Pearson's product-moment correlation coefficient
d_dist_correlation <- dist(d, method = "correlation")
d_clust_average <- hclust(d_dist_correlation, method = "average")
plot(x = d_clust_average)
rect.hclust(tree = d_clust_average, k = 3)

f:id:fornext1119:20180509231827p:plain
f:id:fornext1119:20180509231839p:plain

重心法 x さまざまな距離

# Centroid linkage on eight different distance measures. Each dendrogram is
# drawn and then cut into k = 3 clusters with rect.hclust().
# NOTE(review): several method names below are not accepted by stats::dist();
# this presumably relies on proxy::dist being attached -- confirm.
# NOTE(review): the hclust documentation says "centroid" is typically meant
# for squared Euclidean distances and can produce dendrogram inversions;
# treat results on the other distances as exploratory.

# Euclidean distance
d_dist_euclidean <- dist(d, method = "Euclidean")
d_clust_centroid <- hclust(d_dist_euclidean, method = "centroid")
plot(x = d_clust_centroid)
rect.hclust(tree = d_clust_centroid, k = 3)

# Mahalanobis distance
d_dist_mahalanobis <- dist(d, method = "Mahalanobis")
d_clust_centroid <- hclust(d_dist_mahalanobis, method = "centroid")
plot(x = d_clust_centroid)
rect.hclust(tree = d_clust_centroid, k = 3)

# Manhattan distance
d_dist_manhattan <- dist(d, method = "Manhattan")
d_clust_centroid <- hclust(d_dist_manhattan, method = "centroid")
plot(x = d_clust_centroid)
rect.hclust(tree = d_clust_centroid, k = 3)

# Chebyshev distance
d_dist_chebyshev <- dist(d, method = "Chebyshev")
d_clust_centroid <- hclust(d_dist_chebyshev, method = "centroid")
plot(x = d_clust_centroid)
rect.hclust(tree = d_clust_centroid, k = 3)

# Canberra distance
d_dist_canberra <- dist(d, method = "Canberra")
d_clust_centroid <- hclust(d_dist_canberra, method = "centroid")
plot(x = d_clust_centroid)
rect.hclust(tree = d_clust_centroid, k = 3)

# Distance derived from the extended Jaccard similarity
d_dist_ejaccard <- dist(d, method = "eJaccard")
d_clust_centroid <- hclust(d_dist_ejaccard, method = "centroid")
plot(x = d_clust_centroid)
rect.hclust(tree = d_clust_centroid, k = 3)

# Distance derived from the cosine similarity
d_dist_cosine <- dist(d, method = "cosine")
d_clust_centroid <- hclust(d_dist_cosine, method = "centroid")
plot(x = d_clust_centroid)
rect.hclust(tree = d_clust_centroid, k = 3)

# Distance derived from Pearson's product-moment correlation coefficient
d_dist_correlation <- dist(d, method = "correlation")
d_clust_centroid <- hclust(d_dist_correlation, method = "centroid")
plot(x = d_clust_centroid)
rect.hclust(tree = d_clust_centroid, k = 3)

f:id:fornext1119:20180509232036p:plain
f:id:fornext1119:20180509232045p:plain

重み付き重心法(メディアン法) x さまざまな距離

# Median linkage (weighted centroid) on eight different distance measures.
# Each dendrogram is drawn and then cut into k = 3 clusters with
# rect.hclust().
# NOTE(review): several method names below are not accepted by stats::dist();
# this presumably relies on proxy::dist being attached -- confirm.
# NOTE(review): the hclust documentation warns that "median" does not yield a
# monotone distance measure, so dendrograms may contain inversions.

# Euclidean distance
d_dist_euclidean <- dist(d, method = "Euclidean")
d_clust_median <- hclust(d_dist_euclidean, method = "median")
plot(x = d_clust_median)
rect.hclust(tree = d_clust_median, k = 3)

# Mahalanobis distance
d_dist_mahalanobis <- dist(d, method = "Mahalanobis")
d_clust_median <- hclust(d_dist_mahalanobis, method = "median")
plot(x = d_clust_median)
rect.hclust(tree = d_clust_median, k = 3)

# Manhattan distance
d_dist_manhattan <- dist(d, method = "Manhattan")
d_clust_median <- hclust(d_dist_manhattan, method = "median")
plot(x = d_clust_median)
rect.hclust(tree = d_clust_median, k = 3)

# Chebyshev distance
d_dist_chebyshev <- dist(d, method = "Chebyshev")
d_clust_median <- hclust(d_dist_chebyshev, method = "median")
plot(x = d_clust_median)
rect.hclust(tree = d_clust_median, k = 3)

# Canberra distance
d_dist_canberra <- dist(d, method = "Canberra")
d_clust_median <- hclust(d_dist_canberra, method = "median")
plot(x = d_clust_median)
rect.hclust(tree = d_clust_median, k = 3)

# Distance derived from the extended Jaccard similarity
d_dist_ejaccard <- dist(d, method = "eJaccard")
d_clust_median <- hclust(d_dist_ejaccard, method = "median")
plot(x = d_clust_median)
rect.hclust(tree = d_clust_median, k = 3)

# Distance derived from the cosine similarity
d_dist_cosine <- dist(d, method = "cosine")
d_clust_median <- hclust(d_dist_cosine, method = "median")
plot(x = d_clust_median)
rect.hclust(tree = d_clust_median, k = 3)

# Distance derived from Pearson's product-moment correlation coefficient
d_dist_correlation <- dist(d, method = "correlation")
d_clust_median <- hclust(d_dist_correlation, method = "median")
plot(x = d_clust_median)
rect.hclust(tree = d_clust_median, k = 3)

f:id:fornext1119:20180509232257p:plain
f:id:fornext1119:20180509232313p:plain

 # Restore the graphics parameters that were saved in par.save when the
 # 2 x 2 plot layout was set up.
 par(par.save)