import math


def calcdiff(point1, point2, data=None):
    """Return the absolute difference between two integer-like values.

    Args:
        point1: First value; converted with ``int()``.
        point2: Second value; converted with ``int()``.
        data: Unused. Optional so that callers passing only two arguments
            (such as ``pp_calcdiff``) keep working alongside callers that
            pass three.

    Returns:
        The non-negative integer ``|int(point2) - int(point1)|``.
    """
    return abs(int(point2) - int(point1))


def calcdiff2d(point1, point2):
    """Return the Euclidean distance between two 2D points.

    Args:
        point1: Sequence whose first two items are the x and y coordinates;
            each is converted with ``int()``.
        point2: Second point, same layout as ``point1``.

    Returns:
        The distance as a non-negative float.
    """
    x1, y1 = int(point1[0]), int(point1[1])
    x2, y2 = int(point2[0]), int(point2[1])
    # The y term must use index 1; using index 0 twice would ignore the
    # second dimension entirely.
    return math.hypot(x2 - x1, y2 - y1)


def betrag(number):
    """Return the absolute value of ``number``.

    Negative input is negated and truncated to ``int``; non-negative input
    is returned unchanged.
    """
    if number < 0:
        # Negate directly instead of going through float division, which
        # loses precision for large integers.
        return int(-number)
    return number


def findHighest(data):
    """Return the largest ``int()`` value in ``data``, never less than 0.

    The running maximum starts at 0, so an empty sequence or one holding
    only negative values yields 0.
    """
    maximum = 0
    for value in data:
        maximum = max(maximum, int(value))
    return maximum
def kmeansmk1(xdata, ydata, clusters):
    """Run the 2D k-means loop on ``xdata``/``ydata`` and plot the result.

    Cluster centres are kept in module globals named ``cpoint_<i>`` because
    ``assignCluster`` and ``calcClusters`` read them from there.

    Args:
        xdata: Sequence of x values (anything ``int()`` accepts).
        ydata: Sequence of y values, same length as ``xdata``.
        clusters: Number of cluster centres to fit.
    """
    # Pick the initial centres from the data. randint() includes its upper
    # bound, so the last valid index is len - 1.
    for i in range(clusters):
        globals()["cpoint_" + str(i)] = [
            xdata[randint(0, len(xdata) - 1)],
            ydata[randint(0, len(ydata) - 1)],
        ]
        print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))

    # Largest value per axis; assignCluster uses them for its starting distance.
    highpointx = dmlib.findHighest(xdata)
    highpointy = dmlib.findHighest(ydata)

    done = 0
    runs = 0
    # Alternate assignment and centre update until no centre moves.
    while done == 0:
        runs = runs + 1
        assigned_points = assignCluster(xdata, ydata, clusters, highpointx, highpointy)
        done = calcClusters(xdata, ydata, assigned_points, clusters)

    for i in range(clusters):
        print("Endcluster " + str(i + 1) + " is calculated to be at " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")
    for i in range(clusters):
        centre = globals()["cpoint_" + str(i)]
        plt.plot(centre[0], centre[1], 'ro')
    plt.scatter([int(x) for x in xdata], [int(y) for y in ydata], marker='x', s=7, color='k')
    plt.show()


def calcClusters(xdata, ydata, assigned_points, clusters):
    """Move every cluster centre to the rounded mean of its assigned points.

    Args:
        xdata: Sequence of x values (anything ``int()`` accepts).
        ydata: Sequence of y values, same length as ``xdata``.
        assigned_points: Cluster index for each data item, parallel to
            ``xdata``.
        clusters: Number of clusters.

    Returns:
        1 if no centre changed during this call, otherwise 0.
    """
    # Initialised once, outside the loop, so a change in any cluster is
    # reported rather than only a change in the last one.
    cpointunchanged = 1
    for cluster in range(clusters):
        key = "cpoint_" + str(cluster)
        previous = globals()[key]
        globals()["oldcpoint_" + str(cluster)] = previous

        clustersumx = 0
        clustersumy = 0
        count = 0
        for item in range(len(xdata)):
            if assigned_points[item] == cluster:
                clustersumx += int(xdata[item])
                clustersumy += int(ydata[item])
                count += 1

        if count == 0:
            # No members: keep the old centre instead of dividing by zero.
            continue

        globals()[key] = [round(clustersumx / count), round(clustersumy / count)]
        if previous != globals()[key]:
            cpointunchanged = 0

    return cpointunchanged
def startup(xdata, ydata):
    """Ask for the cluster count, run ``kmeansmk1`` and print the elapsed time.

    Args:
        xdata: x values passed through to ``kmeansmk1``.
        ydata: y values passed through to ``kmeansmk1``.
    """
    # The prompt is answered before the timer starts, so the wait for user
    # input is not part of the measurement.
    cluster_count = int(input("How many clusters are known? (hint: 2) "))

    began = time.time()
    kmeansmk1(xdata, ydata, cluster_count)
    elapsed = time.time() - began

    print(str(elapsed) + " seconds for execution")