Cluster Analysis

We load the well-known iris dataset and keep only its four numeric variables

data(iris)
datos <- iris[,-5]
head(datos)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1          5.1         3.5          1.4         0.2
## 2          4.9         3.0          1.4         0.2
## 3          4.7         3.2          1.3         0.2
## 4          4.6         3.1          1.5         0.2
## 5          5.0         3.6          1.4         0.2
## 6          5.4         3.9          1.7         0.4

We standardize the variables so that they all contribute on a comparable scale to the distance calculations

datos.escalados<- scale(datos)
head(datos.escalados)
##      Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,]   -0.8976739  1.01560199    -1.335752   -1.311052
## [2,]   -1.1392005 -0.13153881    -1.335752   -1.311052
## [3,]   -1.3807271  0.32731751    -1.392399   -1.311052
## [4,]   -1.5014904  0.09788935    -1.279104   -1.311052
## [5,]   -1.0184372  1.24503015    -1.335752   -1.311052
## [6,]   -0.5353840  1.93331463    -1.165809   -1.048667
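
As a quick check (not part of the original code), scale() should leave every column with mean 0 and standard deviation 1:

round(colMeans(datos.escalados), 10) # column means are essentially 0
apply(datos.escalados, 2, sd)        # standard deviations are 1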

Hierarchical Clustering

distancias <- dist(datos.escalados)           # Euclidean distance matrix
c.jerarquico <- hclust(distancias)            # agglomerative clustering (complete linkage by default)
grupos.cluster <- cutree(c.jerarquico, k = 3) # cut the dendrogram into 3 groups
grupos.cluster
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 3 3 3 2 3 2 3 2 3 2 2 3 2 3 3 3 3 2 2 2
##  [71] 3 3 3 3 3 3 3 3 3 2 2 2 2 3 3 3 3 2 3 2 2 3 2 2 2 3 3 3 2 2 3 3 3 3 3
## [106] 3 2 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [141] 3 3 3 3 3 3 3 3 3 3
head(data.frame(datos.escalados, grupos.cluster))
##   Sepal.Length Sepal.Width Petal.Length Petal.Width grupos.cluster
## 1   -0.8976739  1.01560199    -1.335752   -1.311052              1
## 2   -1.1392005 -0.13153881    -1.335752   -1.311052              1
## 3   -1.3807271  0.32731751    -1.392399   -1.311052              1
## 4   -1.5014904  0.09788935    -1.279104   -1.311052              1
## 5   -1.0184372  1.24503015    -1.335752   -1.311052              1
## 6   -0.5353840  1.93331463    -1.165809   -1.048667              1
plot(c.jerarquico, main = "Cluster")
rect.hclust(c.jerarquico, k = 3, border = "red") # outline the 3 groups on the dendrogram
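
Since iris also contains the true species labels, a quick cross-tabulation (added here as a sanity check, not part of the original script) shows how well the three hierarchical groups recover them:

table(iris$Species, grupos.cluster)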

K-Means Clustering

c.kmeans <- kmeans(datos.escalados, 3)
c.kmeans$cluster # cluster assigned to each observation
##   [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [36] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 2 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1
##  [71] 2 1 1 1 1 2 2 2 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2
## [106] 2 1 2 2 2 2 2 2 1 1 2 2 2 2 1 2 1 2 1 2 2 1 2 2 2 2 2 2 1 1 2 2 2 1 2
## [141] 2 2 1 2 2 2 1 2 2 1
head(data.frame(datos.escalados,cluster=c.kmeans$cluster))
##   Sepal.Length Sepal.Width Petal.Length Petal.Width cluster
## 1   -0.8976739  1.01560199    -1.335752   -1.311052       3
## 2   -1.1392005 -0.13153881    -1.335752   -1.311052       3
## 3   -1.3807271  0.32731751    -1.392399   -1.311052       3
## 4   -1.5014904  0.09788935    -1.279104   -1.311052       3
## 5   -1.0184372  1.24503015    -1.335752   -1.311052       3
## 6   -0.5353840  1.93331463    -1.165809   -1.048667       3
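
The fitted object also stores the cluster centres in the scaled space; comparing the assignments with the true species, as before, is a useful check (again an addition, not part of the original script):

c.kmeans$centers                      # centres of the 3 clusters (scaled variables)
table(iris$Species, c.kmeans$cluster) # k-means groups vs. true species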

We attach the cluster membership of each observation to the data

data1 <- data.frame(datos.escalados, cluster = c.kmeans$cluster)

Size of each cluster

c.kmeans$size

Label (characterize) each cluster by the means of its variables

data.x <- subset(data1, select = -cluster)
data.c <- subset(data1, select = cluster)
aggregate(data.x, data.c, mean)

install.packages("doBy")
library(doBy)
summaryBy(Sepal.Length + Sepal.Width + Petal.Length + Petal.Width ~ cluster,
          data = data1, FUN = mean)

install.packages("sqldf")
library(sqldf)
data1_sql <- setNames(data1, gsub("\\.", "_", names(data1))) # dots in column names clash with SQL syntax
sqldf("select cluster, avg(Sepal_Length), avg(Sepal_Width), avg(Petal_Length), avg(Petal_Width)
       from data1_sql group by cluster")

Choosing the Number of Clusters

Silhouette

library(cluster)
diss.data <- daisy(datos.escalados) # dissimilarity matrix
par(mfrow = c(1, 3))
for (h in 2:4) {
  res <- kmeans(datos.escalados, h)
  plot(silhouette(res$cluster, diss.data))
}
par(mfrow = c(1, 1))
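
The average silhouette width condenses each of these plots into a single number, and the elbow plot of the total within-cluster sum of squares is another common criterion. The following sketch is an addition to the original script (nstart = 25 is an arbitrary choice to make k-means more stable):

avg.sil <- sapply(2:6, function(k) {
  res <- kmeans(datos.escalados, k, nstart = 25)
  mean(silhouette(res$cluster, diss.data)[, "sil_width"])
})
avg.sil # prefer the k with the largest average silhouette width

wss <- sapply(1:8, function(k) kmeans(datos.escalados, k, nstart = 25)$tot.withinss)
plot(1:8, wss, type = "b", xlab = "Number of clusters k",
     ylab = "Total within-cluster sum of squares") # look for the elbow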

Cluster Visualization

res <- kmeans(datos.escalados, 3)

clusplot(datos.escalados, res$cluster, color = TRUE, shade = TRUE,
         labels = 2, lines = 0,
         main = "Cluster Plot") # projects the data onto its first two principal components