added two dimensional data analysis support
-added kmeansMkI_2d -added calcdiff2d to dmlib -added plzGenNS and ageGenNS to dmtest to generate unshuffled testdata for kmeans 2d
This commit is contained in:
6
.gitignore
vendored
6
.gitignore
vendored
@@ -1,2 +1,8 @@
|
|||||||
testdata/
|
testdata/
|
||||||
__pycache__/
|
__pycache__/
|
||||||
|
.idea/workspace.xml
|
||||||
|
.idea/vcs.xml
|
||||||
|
.idea/modules.xml
|
||||||
|
.idea/misc.xml
|
||||||
|
.idea/miner.iml
|
||||||
|
.idea/libraries/R_User_Library.xml
|
||||||
|
|||||||
@@ -1,25 +1,34 @@
|
|||||||
import math


def calcdiff(point1, point2, data=None):
    """Return the absolute difference between two one-dimensional values.

    Both values are converted with ``int()`` before subtracting, so numeric
    strings are accepted.

    Args:
        point1: First value (anything ``int()`` accepts).
        point2: Second value (anything ``int()`` accepts).
        data: Unused. It is optional so that two-argument calls such as
            ``calcdiff(data[item], clusterpoint)`` keep working next to
            three-argument calls.

    Returns:
        The non-negative integer ``|int(point1) - int(point2)|``.
    """
    return abs(int(point1) - int(point2))
|
||||||
|
|
||||||
|
def calcdiff2d(point1, point2):
    """Return the Euclidean distance between two 2D points.

    Args:
        point1: Sequence ``[x, y]``; each coordinate is converted with
            ``int()``, so numeric strings are accepted.
        point2: Sequence ``[x, y]``, converted the same way.

    Returns:
        The distance as a non-negative float.
    """
    x1, y1 = int(point1[0]), int(point1[1])
    x2, y2 = int(point2[0]), int(point2[1])
    # The y term must use index 1; using index 0 twice ignores the y axis.
    return math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
|
||||||
|
|
||||||
|
|
||||||
def betrag(number):
    """Return the absolute value ("Betrag") of a number.

    Non-negative input is returned unchanged. Negative input is negated and
    passed through ``int()``, so a negative float loses its fractional part
    (``betrag(-2.7) == 2``).

    Args:
        number: An int or float.

    Returns:
        ``number`` itself when it is >= 0, otherwise ``int(-number)``.
    """
    if number < 0:
        # Negate directly: a detour through float division loses precision
        # for large integers.
        return int(-number)
    return number
|
||||||
|
|
||||||
|
|
||||||
def findHighest(data):
    """Return the highest value in ``data`` as an int.

    Every entry is converted with ``int()`` before comparing, so numeric
    strings are accepted.

    Args:
        data: Iterable of values that ``int()`` accepts.

    Returns:
        The largest converted value, or 0 when ``data`` is empty.
    """
    # default=0 keeps the result for empty input; negative-only data now
    # yields its real maximum instead of 0.
    return max((int(value) for value in data), default=0)
|
||||||
|
|
||||||
def pp_calcdiff(data, clusterpoint):
|
def pp_calcdiff(data, clusterpoint):
|
||||||
max_diff = 0
|
max_diff = 0
|
||||||
@@ -37,4 +46,4 @@ def pp_calcdiff_2(data, clusterpoint, clusterpoint_2):
|
|||||||
if calcdiff(data[item], clusterpoint) + calcdiff(data[item], clusterpoint_2) > max_diff:
|
if calcdiff(data[item], clusterpoint) + calcdiff(data[item], clusterpoint_2) > max_diff:
|
||||||
max_diff = calcdiff(data[item], clusterpoint)
|
max_diff = calcdiff(data[item], clusterpoint)
|
||||||
new_cluster = data[item]
|
new_cluster = data[item]
|
||||||
return new_cluster
|
return new_cluster
|
||||||
|
|||||||
@@ -50,3 +50,33 @@ def numGen(entries, cluster, int_lenght):
|
|||||||
dataArray.append(generateNumber(int_lenght - 1, clusterArray[cluster_decider]))
|
dataArray.append(generateNumber(int_lenght - 1, clusterArray[cluster_decider]))
|
||||||
shuffle(dataArray)
|
shuffle(dataArray)
|
||||||
return dataArray
|
return dataArray
|
||||||
|
def plzGenNS(entries):
    """Generate unshuffled test postal codes (PLZ) in a 40/40/20 split.

    The first 40% of the entries come from ``generateNumber(5, 2)``, the next
    40% from ``generateNumber(5, 6)`` and the rest from ``generateNumber``
    with a random second argument between 0 and 9.

    The result is deliberately NOT shuffled: the position-based link between
    age and PLZ only works in generation order, otherwise there would be
    four clusters.

    Args:
        entries: Number of values to generate; converted once with ``int()``.

    Returns:
        A flat list of the generated values in generation order.
    """
    plz_length = 5
    total = int(entries)
    # Loop-invariant segment boundaries, computed once.
    first_cut = round(total * 0.4)
    second_cut = round(total * 0.8)

    dataArray = []
    for i in range(total):
        if i < first_cut:
            seed = 2
        elif i < second_cut:
            seed = 6
        else:
            seed = randint(0, 9)
        dataArray.append(generateNumber(plz_length, seed))
    return dataArray
|
||||||
|
|
||||||
|
|
||||||
|
def ageGenNS(entries):
    """Generate unshuffled test ages in a 40/40/20 split.

    The first 40% of the entries come from ``generateNumber(2, 2)``, the next
    40% from ``generateNumber(2, 5)`` and the rest from ``generateNumber``
    with a random second argument between 0 and 9. The list is not shuffled,
    so entries stay in generation order.

    Args:
        entries: Number of values to generate; converted once with ``int()``.

    Returns:
        A flat list of the generated values in generation order.
    """
    age_length = 2
    total = int(entries)
    # Loop-invariant segment boundaries, computed once.
    first_cut = round(total * 0.4)
    second_cut = round(total * 0.8)

    dataArray = []
    for i in range(total):
        if i < first_cut:
            seed = 2
        elif i < second_cut:
            seed = 5
        else:
            seed = randint(0, 9)
        dataArray.append(generateNumber(age_length, seed))
    return dataArray
|
||||||
|
|||||||
133
src/algorithms/kmeansMkI_2d.py
Normal file
133
src/algorithms/kmeansMkI_2d.py
Normal file
@@ -0,0 +1,133 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# title: kmeansMkI_2d.py
|
||||||
|
# description: Our personal Python K-Means++ implementation
|
||||||
|
# author: Tillmann Brendel, Conrad Großer
|
||||||
|
# license: Pending
|
||||||
|
# date: 04.06.2018
|
||||||
|
# version: 1.5
|
||||||
|
# usage: python kmeansMkI_2d.py
|
||||||
|
# notes:
|
||||||
|
# known_issues:
|
||||||
|
# python_version: 3.x
|
||||||
|
# ==============================================================================
|
||||||
|
|
||||||
|
# IMPORTS
|
||||||
|
|
||||||
|
# Importing the time for benchmarking purposes
|
||||||
|
import time
|
||||||
|
from datetime import date
|
||||||
|
|
||||||
|
# For random generation of numbers import randint
|
||||||
|
from random import randint
|
||||||
|
|
||||||
|
# Importing library for multi-core processing
|
||||||
|
import multiprocessing
|
||||||
|
|
||||||
|
# Importing libraries for easy plotting
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
# Importing our own libraries: Datamining Library (dmlib) and Datamining Test (dmtest)
|
||||||
|
import dmlib
|
||||||
|
import dmtest
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# CODE
|
||||||
|
# Main function of the algorithm
|
||||||
|
def kmeansmk1(xdata, ydata, clusters):
    """Run the 2D k-means loop and plot the result.

    Cluster centres are stored as module-level globals named ``cpoint_<i>``
    so that ``assignCluster`` and ``calcClusters`` can read and update them.

    Args:
        xdata: Sequence of x values (anything ``int()`` accepts).
        ydata: Sequence of y values, same length as ``xdata``.
        clusters: Number of clusters to fit.

    Side effects:
        Sets the ``cpoint_<i>`` globals, prints the initial and final
        centres, and opens a matplotlib window with the data and centres.
    """
    # Defining cluster points
    for i in range(clusters):
        # randint() includes its upper bound, so the last valid index is
        # len - 1; using len itself can raise IndexError.
        start_x = xdata[randint(0, len(xdata) - 1)]
        start_y = ydata[randint(0, len(ydata) - 1)]
        globals()["cpoint_" + str(i)] = [start_x, start_y]
        print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))

    # Get the maximum value of each data array
    highpointx = dmlib.findHighest(xdata)
    highpointy = dmlib.findHighest(ydata)

    # Loop state: done is set by calcClusters, runs counts the iterations
    done = 0
    runs = 0

    # Re-assign the points and recompute the centres until calcClusters
    # reports that nothing changed
    while done == 0:
        runs = runs + 1
        # assigned_points holds one cluster number per data point
        assigned_points = assignCluster(xdata, ydata, clusters, highpointx, highpointy)
        done = calcClusters(xdata, ydata, assigned_points, clusters)

    for i in range(clusters):
        print("Endcluster " + str(i + 1) + " is calculated to be at " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")

    # Centres as red dots, data points as small black crosses
    for i in range(clusters):
        plt.plot(globals()["cpoint_" + str(i)][0], globals()["cpoint_" + str(i)][1], 'ro')
    plt.scatter([int(x) for x in xdata], [int(y) for y in ydata], marker='x', s=7, color='k')
    plt.show()
|
||||||
|
# Calculates the mean point of each cluster; assigned_points[i] holds the cluster number assigned to data point i
|
||||||
|
def calcClusters(xdata, ydata, assigned_points, clusters):
    """Move every cluster centre to the rounded mean of its assigned points.

    The centres live in the module-level globals ``cpoint_<i>``; the value
    before the update is saved in ``oldcpoint_<i>``.

    Args:
        xdata: Sequence of x values (anything ``int()`` accepts).
        ydata: Sequence of y values, same length as ``xdata``.
        assigned_points: ``assigned_points[i]`` is the cluster number of
            data point ``i``.
        clusters: Number of clusters.

    Returns:
        1 when no centre moved, 0 when at least one centre changed.
    """
    # Tracked across ALL clusters: resetting this inside the loop would let
    # only the last cluster decide convergence.
    all_unchanged = 1

    for cluster in range(clusters):
        key = "cpoint_" + str(cluster)
        previous = globals()[key]
        globals()["oldcpoint_" + str(cluster)] = previous

        sum_x = 0
        sum_y = 0
        count = 0
        for item in range(len(xdata)):
            if assigned_points[item] == cluster:
                sum_x += int(xdata[item])
                sum_y += int(ydata[item])
                count += 1

        if count == 0:
            # An empty cluster has no mean; keep its centre instead of
            # dividing by zero.
            continue

        globals()[key] = [round(sum_x / count), round(sum_y / count)]

        # Compare the old centre with the one just calculated
        if previous != globals()[key]:
            all_unchanged = 0

    return all_unchanged
|
||||||
|
|
||||||
|
|
||||||
|
def assignCluster(xdata, ydata, clusters, highpointx, highpointy):
    """Assign every data point to its nearest cluster centre.

    Distances are computed with ``dmlib.calcdiff2d`` against the centres
    stored in the module-level globals ``cpoint_<i>``.

    Args:
        xdata: Sequence of x values.
        ydata: Sequence of y values, same length as ``xdata``.
        clusters: Number of clusters.
        highpointx: Unused; kept so existing callers keep working.
        highpointy: Unused; kept so existing callers keep working.

    Returns:
        A list with one cluster number per data point. On ties the lowest
        cluster number wins.
    """
    data_assigned = []

    for item in range(len(xdata)):
        point = [xdata[item], ydata[item]]
        # Reset per point and start from infinity so a point can never
        # inherit the previous point's cluster.
        nearest = 0
        best_distance = float("inf")

        for cluster in range(clusters):
            distance = dmlib.calcdiff2d(globals()["cpoint_" + str(cluster)], point)
            if distance < best_distance:
                best_distance = distance
                nearest = cluster

        data_assigned.append(nearest)

    return data_assigned
|
||||||
|
|
||||||
|
|
||||||
|
# Startup function for collecting the necessary input data
|
||||||
|
def startup(xdata, ydata):
    """Ask for the cluster count, run the algorithm and print the run time.

    Args:
        xdata: Sequence of x values passed on to ``kmeansmk1``.
        ydata: Sequence of y values passed on to ``kmeansmk1``.
    """
    # The number of clusters comes from the user (two are used for testing)
    cluster_count = int(input("How many clusters are known? (hint: 2) "))

    # Benchmark: time only the algorithm run, not the prompt above
    started_at = time.time()
    kmeansmk1(xdata, ydata, cluster_count)
    elapsed = time.time() - started_at

    print(str(elapsed) + " seconds for execution")
|
||||||
|
|
||||||
|
|
||||||
|
# Generate the test data and start the algorithm
|
||||||
|
# 1000 unshuffled entries per axis: PLZ values on x, ages on y
xdata, ydata = dmtest.plzGenNS(1000), dmtest.ageGenNS(1000)

startup(xdata, ydata)
|
||||||
Reference in New Issue
Block a user