added two dimensional data analysis support

- Added kmeansMkI_2d. - Added calcdiff2d to dmlib. - Added plzGenNS and ageGenNS to dmtest to generate unshuffled test data for the 2D k-means.
2018-06-04 20:42:55 +02:00
parent aa43c93ae5
commit 667e7881cc
4 changed files with 195 additions and 17 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,8 @@
 testdata/
 __pycache__/
 .idea/workspace.xml
 .idea/vcs.xml
 .idea/modules.xml
 .idea/misc.xml
 .idea/miner.iml
 .idea/libraries/R_User_Library.xml
--- a/src/algorithms/dmlib.py
+++ b/src/algorithms/dmlib.py
@@ -1,25 +1,34 @@
-# Calculate the difference between two points giving the indexes of these data entries
+# Calculate the difference between two points giving the indexes of these xdata entries
-def calcdiff(point1, point2):
+import math
-	if int(point2) > int(point1):
+def calcdiff(point1, point2, data):
-		difference = int(point2) - int(point1)
+    if int(point2) > int(point1):
-	else:
+        difference = int(point2) - int(point1)
-		difference = int(point1) - int(point2)
+    else:
-	# print("Datapoint: " + str(data[point1]) + " | Cluster: " + str(data[point2]) + " | Difference: " + str(difference))
+        difference = int(point1) - int(point2)
-	return betrag(difference)
+    # print("Datapoint: " + str(xdata[point1]) + " | Cluster: " + str(xdata[point2]) + " | Difference: " + str(difference))
    return betrag(difference)
 def calcdiff2d(point1, point2):
    """Return the Euclidean distance between two 2D points.

    point1 and point2 are indexable pairs (x, y). Their entries are
    converted with int(), so numeric strings are accepted as well.
    The result is a non-negative float.
    """
    x1, y1 = int(point1[0]), int(point1[1])
    x2, y2 = int(point2[0]), int(point2[1])
    # The second term has to use the y entries (index 1); taking index 0 twice
    # would ignore the y coordinate completely.
    # sqrt never yields a negative value, so no absolute value is needed afterwards.
    return math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
 # Get the absolute value of a number and returns it as int
 def betrag(number):
-	if number < 0:
+    if number < 0:
-		number = int((-2 * number) / 2)
+        number = int((-2 * number) / 2)
-	return number
+    return number
 # Determine the highest int value in an array and returns is as an int
 def findHighest(data):
-	maximum = 0
+    maximum = 0
-	for i in range(0, len(data)):
+    for i in range(0, len(data)):
-		if int(data[i]) > maximum:
+        if int(data[i]) > maximum:
-			maximum = int(data[i])
+            maximum = int(data[i])
-	return maximum
+    return maximum
 def pp_calcdiff(data, clusterpoint):
 	max_diff = 0
@@ -37,4 +46,4 @@ def pp_calcdiff_2(data, clusterpoint, clusterpoint_2):
 		if calcdiff(data[item], clusterpoint) + calcdiff(data[item], clusterpoint_2) > max_diff:
 			max_diff = calcdiff(data[item], clusterpoint)
 			new_cluster = data[item]
-	return new_cluster
+	return new_cluster
--- a/src/algorithms/dmtest.py
+++ b/src/algorithms/dmtest.py
@@ -50,3 +50,33 @@ def numGen(entries, cluster, int_lenght):
 			dataArray.append(generateNumber(int_lenght - 1, clusterArray[cluster_decider]))
 	shuffle(dataArray)
 	return dataArray
 # Simple generator for test plzs (40-40-20 biased), returns 1D array of plzs
 def plzGenNS(entries):
    """Generate an unshuffled list of test plzs with a 40-40-20 split.

    entries is the number of values to generate; it is converted with int()
    once, so a numeric string works too. The first 40% are generated with
    argument 2, the next 40% with argument 6 and the remainder with a random
    digit (presumably generateNumber biases the value by that digit -
    confirm against its definition).
    """
    plz_length = 5
    total = int(entries)
    # Segment borders depend only on the total, so compute them once
    first_end = round(total * 0.4)
    second_end = round(total * 0.8)
    dataArray = []
    for i in range(total):
        if i < first_end:
            plz = generateNumber(plz_length, 2)
        elif i < second_end:
            plz = generateNumber(plz_length, 6)
        else:
            plz = generateNumber(plz_length, randint(0, 9))
        dataArray.append(plz)
    # Deliberately not shuffled: the position links each plz to the age at the
    # same index, shuffling would break that connection and create 4 clusters
    return dataArray
 def ageGenNS(entries):
    """Generate an unshuffled list of test ages with a 40-40-20 split.

    entries is the number of values to generate; it is converted with int()
    once, so a numeric string works too. The first 40% are generated with
    argument 2, the next 40% with argument 5 and the remainder with a random
    digit (presumably generateNumber biases the value by that digit -
    confirm against its definition).
    """
    age_length = 2
    total = int(entries)
    # Segment borders depend only on the total, so compute them once
    first_end = round(total * 0.4)
    second_end = round(total * 0.8)
    dataArray = []
    for i in range(total):
        if i < first_end:
            age = generateNumber(age_length, 2)
        elif i < second_end:
            age = generateNumber(age_length, 5)
        else:
            age = generateNumber(age_length, randint(0, 9))
        dataArray.append(age)
    # Deliberately not shuffled so the order of the generated segments is kept
    return dataArray
--- a/src/algorithms/kmeansMkI_2d.py
+++ b/src/algorithms/kmeansMkI_2d.py
@@ -0,0 +1,133 @@
 #!/usr/bin/env python
 # title:				kmeansMkI_2d.py
 # description:		Our personal Python K-Means++ implementation
 # author:			Tillmann Brendel, Conrad Großer
 # license:			Pending
 # date:				04.06.2018
 # version:			1.5
 # usage:				python pyscript.py
 # notes:
 # known_issues:
 # python_version:	3.x
 # ==============================================================================
 # IMPORTS
 # Importing the time for benchmarking purposes
 import time
 from datetime import date
 # For random generation of numbers import randint
 from random import randint
 # Importing library for multi core processing
 import multiprocessing
 # Importing libraries for easy plotting
 import numpy as np
 import matplotlib.pyplot as plt
 # Importing own libraries: Datamining Library (dmlib) and Datamining Test (dmtest)
 import dmlib
 import dmtest
 # CODE
 # Main function of the algorithm
 def kmeansmk1(xdata, ydata, clusters):
    """Run the 2D k-means on parallel x/y lists and plot the result.

    xdata and ydata hold the coordinates of the points (entries are converted
    with int() where numbers are needed), clusters is the number of cluster
    centres. The centres live in module globals named cpoint_<i>, which
    assignCluster and calcClusters read and update. Prints the initial and
    final centres and shows a scatter plot; returns nothing.
    """
    # Defining cluster points
    for i in range(0, clusters):
        # randint includes both borders, so the upper border must be len - 1,
        # otherwise the index can run one past the end of the list
        start_x = xdata[randint(0, len(xdata) - 1)]
        start_y = ydata[randint(0, len(ydata) - 1)]
        globals()["cpoint_" + str(i)] = [start_x, start_y]
        print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))
    # Highest value per axis, handed on to assignCluster
    highpointx = dmlib.findHighest(xdata)
    highpointy = dmlib.findHighest(ydata)
    # Define variables for running the algorithm (runs is only used for the report)
    done = 0
    runs = 0
    # Reassign the points and recalculate the centres until calcClusters reports no change
    while done == 0:
        runs = runs + 1
        # assigned_points holds one cluster number per data point
        assigned_points = assignCluster(xdata, ydata, clusters, highpointx, highpointy)
        done = calcClusters(xdata, ydata, assigned_points, clusters)
    for i in range(0, clusters):
        print("Endcluster " + str(i + 1) + " is calculated to be at  " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")
    # Centres as red dots on top of the data points drawn as small black crosses
    for i in range(0, clusters):
        centre = globals()["cpoint_" + str(i)]
        plt.plot(centre[0], centre[1], 'ro')
    plt.scatter([int(x) for x in xdata], [int(y) for y in ydata], marker='x', s=7, color='k')
    plt.show()
 # Calculates the mean point of each cluster from the parallel x/y data lists and the per-item cluster assignments
 def calcClusters(xdata, ydata, assigned_points, clusters):
    """Move every cluster centre to the rounded mean of its assigned points.

    xdata and ydata are parallel lists of coordinates (converted with int()),
    assigned_points holds the cluster number of each point and clusters is
    the number of clusters. The centres are read from and written to the
    module globals cpoint_<i>; the previous centre is kept in oldcpoint_<i>.

    Returns 1 if no centre changed during this pass, otherwise 0.
    """
    # Set the flag once for the whole pass: a single moved centre must clear it,
    # no matter which cluster comes last
    cpointunchanged = 1
    for cluster in range(0, clusters):
        name = "cpoint_" + str(cluster)
        previous = globals()[name]
        globals()["old" + name] = previous
        clustersumx = 0
        clustersumy = 0
        count = 0
        for item in range(0, len(xdata)):
            if assigned_points[item] == cluster:
                clustersumx = clustersumx + int(xdata[item])
                clustersumy = clustersumy + int(ydata[item])
                count = count + 1
        if count == 0:
            # A cluster without points has no mean; keep its centre instead of dividing by zero
            continue
        globals()[name] = [round(clustersumx / count), round(clustersumy / count)]
        # checking if old clusterpoint is equal to the one just calculated
        if previous != globals()[name]:
            cpointunchanged = 0
    return cpointunchanged
 def assignCluster(xdata, ydata, clusters, highpointx, highpointy):
    """Assign every point to the cluster centre with the smallest distance.

    xdata and ydata are parallel lists of coordinates, clusters is the number
    of clusters whose centres are read from the module globals cpoint_<i>.
    highpointx and highpointy are kept for compatibility with existing callers
    and are no longer needed for the calculation.

    Returns a list with one cluster number per point; on equal distances the
    cluster with the lowest number wins.
    """
    data_assigned = []
    for item in range(0, len(xdata)):
        # Start fresh for every point: an infinite start distance guarantees that
        # the point gets its own nearest cluster and never inherits the cluster
        # of the previous point
        assigned_cluster = 0
        olddistance = float("inf")
        for cluster in range(0, clusters):
            distance = dmlib.calcdiff2d(globals()["cpoint_" + str(cluster)], [xdata[item], ydata[item]])
            if distance < olddistance:
                olddistance = distance
                assigned_cluster = cluster
        data_assigned.append(assigned_cluster)
    return data_assigned
 # Startup function: asks for the necessary input, runs the algorithm and times it
 def startup(xdata, ydata):
    """Ask for the number of clusters, run the 2D k-means and print the run time.

    xdata and ydata are handed to kmeansmk1 unchanged. Returns nothing.
    """
    # The cluster count is read interactively; the prompt hints at two for the test data
    clusters = int(input("How many clusters are known? (hint: 2) "))
    # The clock starts after the prompt is answered, so waiting for input is not measured
    began = time.time()
    kmeansmk1(xdata, ydata, clusters)
    elapsed = time.time() - began
    print(str(elapsed) + " seconds for execution")
 # Script entry: generate 1000 unshuffled test plzs (x axis) and 1000 unshuffled
 # test ages (y axis), then start the interactive run.
 # NOTE(review): there is no __main__ guard, so this also runs when the module is imported.
 xdata = dmtest.plzGenNS(1000)
 ydata = dmtest.ageGenNS(1000)
 startup(xdata, ydata)