library(tidyverse)

# Draws a bubble chart of how many players of each position fall into each
# cluster. Expects `df` to contain the columns `cluster` and `position`.
plot_position_clusters <- function(df) {
  df %>%
    group_by(cluster, position) %>%
    # .groups = "drop" returns an ungrouped summary and silences the
    # regrouping message that summarise() prints otherwise.
    summarise(count = n(), .groups = "drop") %>%
    ggplot(aes(x = position, y = as.factor(cluster), size = count)) +
    geom_point() +
    xlab("Position") +
    ylab("Cluster") +
    theme_bw()
}

# Exercise 1 ----
combine_df <- read_csv(
  "https://github.com/NikoStein/dma_data/raw/main/combine.csv"
)

# a) Select suitable variables and cluster the data with k-means in order to
#    identify positions that place similar demands on the players.

# Exploratory view of bench press against body weight. The transparency is a
# constant, so it is set on the geom; inside aes() it would be treated as a
# data mapping (no 0.1 opacity, plus a spurious legend entry).
combine_df %>%
  ggplot(aes(x = weight_kg, y = bench, color = position)) +
  geom_point(alpha = 0.1) +
  theme_bw()

# Drop the bench column, then keep only rows without missing values.
combine_df <- combine_df %>%
  select(-bench) %>%
  na.omit()

# Standardised feature matrix, computed once and shared by the k-means and
# the hierarchical clustering below.
combine_scaled <- combine_df %>%
  select(
    height_in, weight_kg, vertical_cm, broad_jump_cm, x3cone, shuttle, x40yd
  ) %>%
  scale()

# k-means starts from random centres: fix the seed so the result is
# reproducible and use several starts to avoid a poor local optimum.
set.seed(42)
clustering <- kmeans(combine_scaled, centers = 3, nstart = 25)

combine_df$cluster <- clustering$cluster
plot_position_clusters(combine_df)

# b)
dist_mat <- dist(combine_scaled)

# NOTE(review): the names `hclust_avg` / `cut_avg` suggest average linkage,
# but the call uses complete linkage - confirm which one is intended.
hclust_avg <- hclust(dist_mat, method = "complete")
plot(hclust_avg)

cut_avg <- cutree(hclust_avg, k = 4)

# Overwrites the k-means labels stored above with the hierarchical ones.
combine_df$cluster <- cut_avg
plot_position_clusters(combine_df)

# Exercise 2 ----
mall_df <- read_csv(
  "https://github.com/NikoStein/dma_data/raw/main/Mall_Customers.csv"
)

# a) Cluster the customers with the k-means algorithm.
set.seed(42)
clusters <- mall_df %>%
  select(-CustomerID) %>%
  # Encode gender numerically ("Male" -> 0, anything else -> 1) so that it
  # can be scaled and used by k-means.
  mutate(Gender = if_else(Gender == "Male", 0, 1)) %>%
  scale() %>%
  kmeans(centers = 3, nstart = 25)

clusters

# b) Try different numbers of clusters. Which number seems sensible to you?
# Elbow analysis: fit k-means for 2 to 20 centres on the standardised
# customer features and record the between-cluster and total within-cluster
# sums of squares of each fit.

# The preprocessing does not depend on the number of centres, so it is done
# once instead of being repeated for every fit.
mall_scaled <- mall_df %>%
  select(-CustomerID) %>%
  mutate(Gender = if_else(Gender == "Male", 0, 1)) %>%
  scale()

# Fixed seed plus several random starts per k: reproducible results and a
# curve that is not distorted by a single unlucky initialisation.
set.seed(42)

# One single-row data frame per k, combined in one step instead of growing
# the result with rbind() inside a loop. The fits stay local to the function,
# so the 3-cluster result stored in `clusters` is no longer overwritten.
cluster_data <- do.call(
  rbind,
  lapply(2:20, function(k) {
    fit <- kmeans(mall_scaled, centers = k, nstart = 25)
    data.frame(
      centers = k,
      between = fit$betweenss,
      within = fit$tot.withinss
    )
  })
)

# Between-cluster SS in red, total within-cluster SS in blue.
cluster_data %>%
  ggplot(aes(x = centers)) +
  geom_line(aes(y = between), color = "red") +
  geom_line(aes(y = within), color = "blue") +
  theme_bw()