added two dimensional data analysis support

-added kmeansMkI_2d
-added calcdiff2d to dmlib
-added plzGenNS and ageGenNS to dmtest to generate unshuffled test data for kmeans 2d
This commit is contained in:
tchemn
2018-06-04 20:42:55 +02:00
parent aa43c93ae5
commit 667e7881cc
4 changed files with 195 additions and 17 deletions

6
.gitignore vendored
View File

@@ -1,2 +1,8 @@
testdata/ testdata/
__pycache__/ __pycache__/
.idea/workspace.xml
.idea/vcs.xml
.idea/modules.xml
.idea/misc.xml
.idea/miner.iml
.idea/libraries/R_User_Library.xml

View File

@@ -1,25 +1,34 @@
import math


# Calculate the absolute difference between two integer-like values
def calcdiff(point1, point2, data=None):
    """Return the absolute difference between two values.

    Args:
        point1: First value; converted with ``int()``.
        point2: Second value; converted with ``int()``.
        data: Unused. Optional so that existing two-argument callers
            (e.g. ``pp_calcdiff_2``) keep working.

    Returns:
        ``|int(point1) - int(point2)|`` as a non-negative int.
    """
    return abs(int(point1) - int(point2))
def calcdiff2d(point1, point2):
    """Return the Euclidean distance between two 2D points.

    Args:
        point1: Sequence whose entries 0 and 1 are the x and y coordinates;
            each is converted with ``int()``.
        point2: Same layout as ``point1``.

    Returns:
        The straight-line distance as a non-negative float.
    """
    x1, y1 = int(point1[0]), int(point1[1])
    x2, y2 = int(point2[0]), int(point2[1])
    # The second term has to use the y coordinates (index 1); otherwise the
    # second dimension never contributes to the distance.
    # math.sqrt never returns a negative value, so no absolute-value step is needed.
    return math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
# Get the absolute value of a number
def betrag(number):
    """Return the absolute value of ``number``.

    Args:
        number: An int or float.

    Returns:
        ``number`` unchanged when it is zero or positive. For negative input
        the negated value converted with ``int()`` (so negative floats are
        truncated toward zero).
    """
    if number < 0:
        # Negate directly instead of multiplying and dividing: the division
        # would go through float and lose precision for large integers.
        return int(-number)
    return number
# Determine the highest int value in an array and return it as an int
def findHighest(data):
    """Return the largest value in ``data`` after converting each entry with ``int()``.

    Args:
        data: Iterable of int-convertible values (ints or numeric strings).

    Returns:
        The maximum as an int, or 0 when ``data`` is empty.
    """
    # No artificial floor of 0: data consisting only of negative values must
    # yield its real maximum. The default only covers the empty case.
    return max((int(value) for value in data), default=0)
def pp_calcdiff(data, clusterpoint): def pp_calcdiff(data, clusterpoint):
max_diff = 0 max_diff = 0
@@ -37,4 +46,4 @@ def pp_calcdiff_2(data, clusterpoint, clusterpoint_2):
if calcdiff(data[item], clusterpoint) + calcdiff(data[item], clusterpoint_2) > max_diff: if calcdiff(data[item], clusterpoint) + calcdiff(data[item], clusterpoint_2) > max_diff:
max_diff = calcdiff(data[item], clusterpoint) max_diff = calcdiff(data[item], clusterpoint)
new_cluster = data[item] new_cluster = data[item]
return new_cluster return new_cluster

View File

@@ -50,3 +50,33 @@ def numGen(entries, cluster, int_lenght):
dataArray.append(generateNumber(int_lenght - 1, clusterArray[cluster_decider])) dataArray.append(generateNumber(int_lenght - 1, clusterArray[cluster_decider]))
shuffle(dataArray) shuffle(dataArray)
return dataArray return dataArray
# Simple generator for test plzs (40-40-20 biased), returns 1D array of plzs
def plzGenNS(entries):
    """Generate an unshuffled list of test PLZ values in a 40/40/20 split.

    The first 40 % of the entries are generated with cluster argument 2, the
    next 40 % with cluster argument 6 and the rest with a random argument
    from 0 to 9. The list is deliberately NOT shuffled so that index i here
    lines up with index i of the age data (age ==> plz); shuffling would
    break that pairing and produce four clusters instead of two.

    Args:
        entries: Number of values to generate; converted with ``int()``.

    Returns:
        List with ``int(entries)`` values produced by ``generateNumber``.
    """
    total = int(entries)
    plz_length = 5
    # Loop-invariant segment boundaries, derived from the same int as the range
    first_end = round(total * 0.4)
    second_end = round(total * 0.8)

    data_array = []
    for i in range(total):
        if i < first_end:
            cluster_arg = 2
        elif i < second_end:
            cluster_arg = 6
        else:
            cluster_arg = randint(0, 9)
        # NOTE(review): second argument presumably selects the cluster the
        # number is generated around - confirm against generateNumber
        data_array.append(generateNumber(plz_length, cluster_arg))
    return data_array
# Simple generator for test ages (40-40-20 biased), returns 1D array of ages
def ageGenNS(entries):
    """Generate an unshuffled list of test age values in a 40/40/20 split.

    The first 40 % of the entries are generated with cluster argument 2, the
    next 40 % with cluster argument 5 and the rest with a random argument
    from 0 to 9. The list is not shuffled, so the segment an entry belongs to
    is determined by its index.

    Args:
        entries: Number of values to generate; converted with ``int()``.

    Returns:
        List with ``int(entries)`` values produced by ``generateNumber``.
    """
    total = int(entries)
    age_length = 2
    # Loop-invariant segment boundaries, derived from the same int as the range
    first_end = round(total * 0.4)
    second_end = round(total * 0.8)

    data_array = []
    for i in range(total):
        if i < first_end:
            cluster_arg = 2
        elif i < second_end:
            cluster_arg = 5
        else:
            cluster_arg = randint(0, 9)
        # NOTE(review): second argument presumably selects the cluster the
        # number is generated around - confirm against generateNumber
        data_array.append(generateNumber(age_length, cluster_arg))
    return data_array

View File

@@ -0,0 +1,133 @@
#!/usr/bin/env python
# title: kmeansMkI.py
# description: Our personal Python K-Means++ implementation
# author: Tillmann Brendel, Conrad Großer
# license: Pending
# date: 04.06.2018
# version: 1.5
# usage: python pyscript.py
# notes:
# known_issues:
# python_version: 3.x
# ==============================================================================
# IMPORTS
# Importing the time for benchmarking purposes
import time
from datetime import date
# For random generation of numbers import randint
from random import randint
# Importing library for multi-core processing
import multiprocessing
# Importing libraries for easy plotting
import numpy as np
import matplotlib.pyplot as plt
# Importing our own libraries: Datamining Library and Datamining Test
import dmlib
import dmtest
# CODE
# Main function of the algorithm
def kmeansmk1(xdata, ydata, clusters):
    """Run the 2D k-means loop on paired x/y data, then print and plot the result.

    Cluster points are kept in module globals named ``cpoint_<i>``; they are
    read and updated by ``assignCluster`` and ``calcClusters``.

    Args:
        xdata: Sequence of int-convertible x values.
        ydata: Sequence of int-convertible y values.
        clusters: Number of clusters to fit.

    Returns:
        None. The final cluster points are printed and shown in a matplotlib
        plot (cluster points as red circles, data as black x markers).
    """
    # Defining cluster points
    for i in range(clusters):
        # randint includes BOTH bounds, so the upper bound has to be len - 1;
        # len itself would occasionally index one past the end of the list.
        start_x = xdata[randint(0, len(xdata) - 1)]
        start_y = ydata[randint(0, len(ydata) - 1)]
        globals()["cpoint_" + str(i)] = [start_x, start_y]
        print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))

    # Get the max value of each data array
    highpointx = dmlib.findHighest(xdata)
    highpointy = dmlib.findHighest(ydata)

    # done stays 0 until calcClusters reports that no cluster point moved;
    # runs counts the iterations for the final report
    done = 0
    runs = 0
    while done == 0:
        runs = runs + 1
        # assigned_points holds one cluster number per data item
        assigned_points = assignCluster(xdata, ydata, clusters, highpointx, highpointy)
        done = calcClusters(xdata, ydata, assigned_points, clusters)

    for i in range(clusters):
        print("Endcluster " + str(i + 1) + " is calculated to be at " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")

    for i in range(clusters):
        plt.plot(globals()["cpoint_" + str(i)][0], globals()["cpoint_" + str(i)][1], 'ro')
    plt.scatter([int(x) for x in xdata], [int(y) for y in ydata], marker='x', s=7, color='k')
    plt.show()
# Calculates the mean point of each cluster from the per-item cluster numbers
def calcClusters(xdata, ydata, assigned_points, clusters):
    """Move every cluster point to the rounded mean of its assigned items.

    Reads and writes the module globals ``cpoint_<i>`` and stores the previous
    value of each in ``oldcpoint_<i>``.

    Args:
        xdata: Sequence of int-convertible x values.
        ydata: Sequence of int-convertible y values.
        assigned_points: One cluster number per item, parallel to the data.
        clusters: Number of clusters.

    Returns:
        1 if no cluster point changed during this call, otherwise 0.
    """
    # Tracked across ALL clusters: a single moved cluster point must keep the
    # algorithm running, no matter which cluster it belongs to.
    cpointunchanged = 1
    for cluster in range(clusters):
        key = "cpoint_" + str(cluster)
        previous = globals()[key]
        globals()["oldcpoint_" + str(cluster)] = previous

        clustersumx = 0
        clustersumy = 0
        count = 0
        for x_value, y_value, owner in zip(xdata, ydata, assigned_points):
            if owner == cluster:
                clustersumx += int(x_value)
                clustersumy += int(y_value)
                count += 1

        if count == 0:
            # Nothing was assigned to this cluster: keep its point where it is
            # instead of dividing by zero.
            continue

        updated = [round(clustersumx / count), round(clustersumy / count)]
        globals()[key] = updated
        # Checking if the old cluster point is equal to the one just calculated
        if previous != updated:
            cpointunchanged = 0
    return cpointunchanged
# Assigns every data item to its nearest cluster point
def assignCluster(xdata, ydata, clusters, highpointx, highpointy):
    """Return the number of the nearest cluster for every data item.

    Distances come from ``dmlib.calcdiff2d`` between the item and each module
    global ``cpoint_<i>``. On equal distances the lowest cluster number wins.

    Args:
        xdata: Sequence of x values.
        ydata: Sequence of y values, parallel to ``xdata``.
        clusters: Number of clusters.
        highpointx: Unused; kept so existing callers do not break.
        highpointy: Unused; kept so existing callers do not break.

    Returns:
        List with one cluster number per item, in item order.
    """
    data_assigned = []
    for item in range(len(xdata)):
        # Reset per item: an infinite starting distance guarantees that the
        # first cluster always wins, so no assignment can leak over from the
        # previous item.
        assigned_cluster = 0
        best_distance = float("inf")
        for cluster in range(clusters):
            distance = dmlib.calcdiff2d(globals()["cpoint_" + str(cluster)], [xdata[item], ydata[item]])
            if distance < best_distance:
                best_distance = distance
                assigned_cluster = cluster
        data_assigned.append(assigned_cluster)
    return data_assigned
# Startup function: collects the cluster count and times one run
def startup(xdata, ydata):
    """Ask for the number of clusters, run kmeansmk1 and print the elapsed time.

    Args:
        xdata: x values handed through to ``kmeansmk1``.
        ydata: y values handed through to ``kmeansmk1``.

    Returns:
        None. The wall-clock duration of the run is printed in seconds.
    """
    cluster_count = int(input("How many clusters are known? (hint: 2) "))

    # The timer starts after the prompt so typing time is not measured
    began = time.time()
    kmeansmk1(xdata, ydata, cluster_count)
    elapsed = time.time() - began

    print(str(elapsed) + " seconds for execution")
# Generate test data and start the algorithm, but only when run as a script:
# importing this module must not prompt for input or open a plot window.
if __name__ == "__main__":
    xdata = dmtest.plzGenNS(1000)
    ydata = dmtest.ageGenNS(1000)
    startup(xdata, ydata)