Rでクラスター分析~少年サッカー データ分析~
Rでクラスター分析
距離の計算方法も、分類(クラスターの結合)方法も色々あって、アルゴリズムの説明を読んだだけでは、どれを選択すればよいのかピンと来ない。
ちなみに、ここの説明が詳しい
www.albert2005.co.jp
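まずは、色々試してみる。なお、以下で指定している "Mahalanobis" や "eJaccard" などの距離名は標準の stats::dist() では使えないため、事前に proxy パッケージを読み込んでおく(library(proxy))。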
# Setup: load the data set and prepare a 2 x 2 plotting grid.
# The working directory is changed so that "stats.txt" resolves relative to it.
setwd("C:/data")

# Read the table; the first row of the file holds the column names.
d <- read.table("stats.txt", header = TRUE)

# par() returns the previous settings; keep them so the layout can be
# restored once all dendrograms have been drawn.
par.save <- par(mfrow = c(2, 2))
ウォード法 x さまざまな距離
# Ward's method ("ward.D") combined with eight different distance measures.
# Each section computes a distance object from `d`, clusters it, draws the
# dendrogram and outlines the clusters obtained by cutting the tree at k = 3.
# NOTE(review): method names such as "Euclidean", "Mahalanobis" or "eJaccard"
# are not accepted by stats::dist(); presumably the proxy package is attached
# before this point -- confirm.
# NOTE(review): per ?hclust, "ward.D" implements Ward's criterion only when
# given squared distances ("ward.D2" squares them internally) -- confirm that
# "ward.D" is what is intended here.

# Euclidean distance
d_dist_euclidean <- dist(d, method = "Euclidean")
d_clust_ward <- hclust(d_dist_euclidean, method = "ward.D")
plot(d_clust_ward)
rect.hclust(d_clust_ward, k = 3)

# Mahalanobis distance
d_dist_mahalanobis <- dist(d, method = "Mahalanobis")
d_clust_ward <- hclust(d_dist_mahalanobis, method = "ward.D")
plot(d_clust_ward)
rect.hclust(d_clust_ward, k = 3)

# Manhattan distance
d_dist_manhattan <- dist(d, method = "Manhattan")
d_clust_ward <- hclust(d_dist_manhattan, method = "ward.D")
plot(d_clust_ward)
rect.hclust(d_clust_ward, k = 3)

# Chebyshev distance
d_dist_chebyshev <- dist(d, method = "Chebyshev")
d_clust_ward <- hclust(d_dist_chebyshev, method = "ward.D")
plot(d_clust_ward)
rect.hclust(d_clust_ward, k = 3)

# Canberra distance
d_dist_canberra <- dist(d, method = "Canberra")
d_clust_ward <- hclust(d_dist_canberra, method = "ward.D")
plot(d_clust_ward)
rect.hclust(d_clust_ward, k = 3)

# Distance based on the extended Jaccard similarity
d_dist_ejaccard <- dist(d, method = "eJaccard")
d_clust_ward <- hclust(d_dist_ejaccard, method = "ward.D")
plot(d_clust_ward)
rect.hclust(d_clust_ward, k = 3)

# Distance based on the cosine similarity
d_dist_cosine <- dist(d, method = "cosine")
d_clust_ward <- hclust(d_dist_cosine, method = "ward.D")
plot(d_clust_ward)
rect.hclust(d_clust_ward, k = 3)

# Distance based on Pearson's product-moment correlation coefficient
d_dist_correlation <- dist(d, method = "correlation")
d_clust_ward <- hclust(d_dist_correlation, method = "ward.D")
plot(d_clust_ward)
rect.hclust(d_clust_ward, k = 3)
最近隣法(最短距離法) x さまざまな距離
# Single linkage (nearest neighbour) combined with eight distance measures.
# Each section computes a distance object from `d`, clusters it, draws the
# dendrogram and outlines the clusters obtained by cutting the tree at k = 3.
# NOTE(review): method names such as "Euclidean", "Mahalanobis" or "eJaccard"
# are not accepted by stats::dist(); presumably the proxy package is attached
# before this point -- confirm.

# Euclidean distance
d_dist_euclidean <- dist(d, method = "Euclidean")
d_clust_single <- hclust(d_dist_euclidean, method = "single")
plot(d_clust_single)
rect.hclust(d_clust_single, k = 3)

# Mahalanobis distance
d_dist_mahalanobis <- dist(d, method = "Mahalanobis")
d_clust_single <- hclust(d_dist_mahalanobis, method = "single")
plot(d_clust_single)
rect.hclust(d_clust_single, k = 3)

# Manhattan distance
d_dist_manhattan <- dist(d, method = "Manhattan")
d_clust_single <- hclust(d_dist_manhattan, method = "single")
plot(d_clust_single)
rect.hclust(d_clust_single, k = 3)

# Chebyshev distance
d_dist_chebyshev <- dist(d, method = "Chebyshev")
d_clust_single <- hclust(d_dist_chebyshev, method = "single")
plot(d_clust_single)
rect.hclust(d_clust_single, k = 3)

# Canberra distance
d_dist_canberra <- dist(d, method = "Canberra")
d_clust_single <- hclust(d_dist_canberra, method = "single")
plot(d_clust_single)
rect.hclust(d_clust_single, k = 3)

# Distance based on the extended Jaccard similarity
d_dist_ejaccard <- dist(d, method = "eJaccard")
d_clust_single <- hclust(d_dist_ejaccard, method = "single")
plot(d_clust_single)
rect.hclust(d_clust_single, k = 3)

# Distance based on the cosine similarity
d_dist_cosine <- dist(d, method = "cosine")
d_clust_single <- hclust(d_dist_cosine, method = "single")
plot(d_clust_single)
rect.hclust(d_clust_single, k = 3)

# Distance based on Pearson's product-moment correlation coefficient
d_dist_correlation <- dist(d, method = "correlation")
d_clust_single <- hclust(d_dist_correlation, method = "single")
plot(d_clust_single)
rect.hclust(d_clust_single, k = 3)
最遠隣法(最長距離法) x さまざまな距離
# Complete linkage (furthest neighbour) combined with eight distance measures.
# Each section computes a distance object from `d`, clusters it, draws the
# dendrogram and outlines the clusters obtained by cutting the tree at k = 3.
# NOTE(review): method names such as "Euclidean", "Mahalanobis" or "eJaccard"
# are not accepted by stats::dist(); presumably the proxy package is attached
# before this point -- confirm.

# Euclidean distance
d_dist_euclidean <- dist(d, method = "Euclidean")
d_clust_complete <- hclust(d_dist_euclidean, method = "complete")
plot(d_clust_complete)
rect.hclust(d_clust_complete, k = 3)

# Mahalanobis distance
d_dist_mahalanobis <- dist(d, method = "Mahalanobis")
d_clust_complete <- hclust(d_dist_mahalanobis, method = "complete")
plot(d_clust_complete)
rect.hclust(d_clust_complete, k = 3)

# Manhattan distance
d_dist_manhattan <- dist(d, method = "Manhattan")
d_clust_complete <- hclust(d_dist_manhattan, method = "complete")
plot(d_clust_complete)
rect.hclust(d_clust_complete, k = 3)

# Chebyshev distance
d_dist_chebyshev <- dist(d, method = "Chebyshev")
d_clust_complete <- hclust(d_dist_chebyshev, method = "complete")
plot(d_clust_complete)
rect.hclust(d_clust_complete, k = 3)

# Canberra distance
d_dist_canberra <- dist(d, method = "Canberra")
d_clust_complete <- hclust(d_dist_canberra, method = "complete")
plot(d_clust_complete)
rect.hclust(d_clust_complete, k = 3)

# Distance based on the extended Jaccard similarity
d_dist_ejaccard <- dist(d, method = "eJaccard")
d_clust_complete <- hclust(d_dist_ejaccard, method = "complete")
plot(d_clust_complete)
rect.hclust(d_clust_complete, k = 3)

# Distance based on the cosine similarity
d_dist_cosine <- dist(d, method = "cosine")
d_clust_complete <- hclust(d_dist_cosine, method = "complete")
plot(d_clust_complete)
rect.hclust(d_clust_complete, k = 3)

# Distance based on Pearson's product-moment correlation coefficient
d_dist_correlation <- dist(d, method = "correlation")
d_clust_complete <- hclust(d_dist_correlation, method = "complete")
plot(d_clust_complete)
rect.hclust(d_clust_complete, k = 3)
群平均法 x さまざまな距離
# Group average linkage ("average") combined with eight distance measures.
# Each section computes a distance object from `d`, clusters it, draws the
# dendrogram and outlines the clusters obtained by cutting the tree at k = 3.
# NOTE(review): method names such as "Euclidean", "Mahalanobis" or "eJaccard"
# are not accepted by stats::dist(); presumably the proxy package is attached
# before this point -- confirm.

# Euclidean distance
d_dist_euclidean <- dist(d, method = "Euclidean")
d_clust_average <- hclust(d_dist_euclidean, method = "average")
plot(d_clust_average)
rect.hclust(d_clust_average, k = 3)

# Mahalanobis distance
d_dist_mahalanobis <- dist(d, method = "Mahalanobis")
d_clust_average <- hclust(d_dist_mahalanobis, method = "average")
plot(d_clust_average)
rect.hclust(d_clust_average, k = 3)

# Manhattan distance
d_dist_manhattan <- dist(d, method = "Manhattan")
d_clust_average <- hclust(d_dist_manhattan, method = "average")
plot(d_clust_average)
rect.hclust(d_clust_average, k = 3)

# Chebyshev distance
d_dist_chebyshev <- dist(d, method = "Chebyshev")
d_clust_average <- hclust(d_dist_chebyshev, method = "average")
plot(d_clust_average)
rect.hclust(d_clust_average, k = 3)

# Canberra distance
d_dist_canberra <- dist(d, method = "Canberra")
d_clust_average <- hclust(d_dist_canberra, method = "average")
plot(d_clust_average)
rect.hclust(d_clust_average, k = 3)

# Distance based on the extended Jaccard similarity
d_dist_ejaccard <- dist(d, method = "eJaccard")
d_clust_average <- hclust(d_dist_ejaccard, method = "average")
plot(d_clust_average)
rect.hclust(d_clust_average, k = 3)

# Distance based on the cosine similarity
d_dist_cosine <- dist(d, method = "cosine")
d_clust_average <- hclust(d_dist_cosine, method = "average")
plot(d_clust_average)
rect.hclust(d_clust_average, k = 3)

# Distance based on Pearson's product-moment correlation coefficient
d_dist_correlation <- dist(d, method = "correlation")
d_clust_average <- hclust(d_dist_correlation, method = "average")
plot(d_clust_average)
rect.hclust(d_clust_average, k = 3)
重心法 x さまざまな距離
# Centroid linkage combined with eight distance measures.
# Each section computes a distance object from `d`, clusters it, draws the
# dendrogram and outlines the clusters obtained by cutting the tree at k = 3.
# NOTE(review): method names such as "Euclidean", "Mahalanobis" or "eJaccard"
# are not accepted by stats::dist(); presumably the proxy package is attached
# before this point -- confirm.
# NOTE(review): ?hclust describes the "centroid" method as meant for squared
# Euclidean distances; results for the other measures should be read with
# that in mind -- confirm this exploratory use is intended.

# Euclidean distance
d_dist_euclidean <- dist(d, method = "Euclidean")
d_clust_centroid <- hclust(d_dist_euclidean, method = "centroid")
plot(d_clust_centroid)
rect.hclust(d_clust_centroid, k = 3)

# Mahalanobis distance
d_dist_mahalanobis <- dist(d, method = "Mahalanobis")
d_clust_centroid <- hclust(d_dist_mahalanobis, method = "centroid")
plot(d_clust_centroid)
rect.hclust(d_clust_centroid, k = 3)

# Manhattan distance
d_dist_manhattan <- dist(d, method = "Manhattan")
d_clust_centroid <- hclust(d_dist_manhattan, method = "centroid")
plot(d_clust_centroid)
rect.hclust(d_clust_centroid, k = 3)

# Chebyshev distance
d_dist_chebyshev <- dist(d, method = "Chebyshev")
d_clust_centroid <- hclust(d_dist_chebyshev, method = "centroid")
plot(d_clust_centroid)
rect.hclust(d_clust_centroid, k = 3)

# Canberra distance
d_dist_canberra <- dist(d, method = "Canberra")
d_clust_centroid <- hclust(d_dist_canberra, method = "centroid")
plot(d_clust_centroid)
rect.hclust(d_clust_centroid, k = 3)

# Distance based on the extended Jaccard similarity
d_dist_ejaccard <- dist(d, method = "eJaccard")
d_clust_centroid <- hclust(d_dist_ejaccard, method = "centroid")
plot(d_clust_centroid)
rect.hclust(d_clust_centroid, k = 3)

# Distance based on the cosine similarity
d_dist_cosine <- dist(d, method = "cosine")
d_clust_centroid <- hclust(d_dist_cosine, method = "centroid")
plot(d_clust_centroid)
rect.hclust(d_clust_centroid, k = 3)

# Distance based on Pearson's product-moment correlation coefficient
d_dist_correlation <- dist(d, method = "correlation")
d_clust_centroid <- hclust(d_dist_correlation, method = "centroid")
plot(d_clust_centroid)
rect.hclust(d_clust_centroid, k = 3)
重み付き重心法(メディアン法) x さまざまな距離
# Weighted centroid linkage ("median") combined with eight distance measures.
# Each section computes a distance object from `d`, clusters it, draws the
# dendrogram and outlines the clusters obtained by cutting the tree at k = 3.
# NOTE(review): method names such as "Euclidean", "Mahalanobis" or "eJaccard"
# are not accepted by stats::dist(); presumably the proxy package is attached
# before this point -- confirm.
# NOTE(review): ?hclust describes the "median" method as meant for squared
# Euclidean distances; results for the other measures should be read with
# that in mind -- confirm this exploratory use is intended.

# Euclidean distance
d_dist_euclidean <- dist(d, method = "Euclidean")
d_clust_median <- hclust(d_dist_euclidean, method = "median")
plot(d_clust_median)
rect.hclust(d_clust_median, k = 3)

# Mahalanobis distance
d_dist_mahalanobis <- dist(d, method = "Mahalanobis")
d_clust_median <- hclust(d_dist_mahalanobis, method = "median")
plot(d_clust_median)
rect.hclust(d_clust_median, k = 3)

# Manhattan distance
d_dist_manhattan <- dist(d, method = "Manhattan")
d_clust_median <- hclust(d_dist_manhattan, method = "median")
plot(d_clust_median)
rect.hclust(d_clust_median, k = 3)

# Chebyshev distance
d_dist_chebyshev <- dist(d, method = "Chebyshev")
d_clust_median <- hclust(d_dist_chebyshev, method = "median")
plot(d_clust_median)
rect.hclust(d_clust_median, k = 3)

# Canberra distance
d_dist_canberra <- dist(d, method = "Canberra")
d_clust_median <- hclust(d_dist_canberra, method = "median")
plot(d_clust_median)
rect.hclust(d_clust_median, k = 3)

# Distance based on the extended Jaccard similarity
d_dist_ejaccard <- dist(d, method = "eJaccard")
d_clust_median <- hclust(d_dist_ejaccard, method = "median")
plot(d_clust_median)
rect.hclust(d_clust_median, k = 3)

# Distance based on the cosine similarity
d_dist_cosine <- dist(d, method = "cosine")
d_clust_median <- hclust(d_dist_cosine, method = "median")
plot(d_clust_median)
rect.hclust(d_clust_median, k = 3)

# Distance based on Pearson's product-moment correlation coefficient
d_dist_correlation <- dist(d, method = "correlation")
d_clust_median <- hclust(d_dist_correlation, method = "median")
plot(d_clust_median)
rect.hclust(d_clust_median, k = 3)
# Re-apply the graphical parameters stored in `par.save`.
par(par.save)