added two dimensional data analysis support
-added kmeansMkI_2d -added calcdiff2d to dmlib -added plzGenNS and ageGenNS to dmtest to generate unshuffled testdata for kmeans 2d
This commit is contained in:
6
.gitignore
vendored
6
.gitignore
vendored
@@ -1,2 +1,8 @@
|
||||
testdata/
|
||||
__pycache__/
|
||||
.idea/workspace.xml
|
||||
.idea/vcs.xml
|
||||
.idea/modules.xml
|
||||
.idea/misc.xml
|
||||
.idea/miner.iml
|
||||
.idea/libraries/R_User_Library.xml
|
||||
|
||||
@@ -1,18 +1,27 @@
|
||||
# Calculate the difference between two points giving the indexes of these data entries
|
||||
def calcdiff(point1, point2):
|
||||
# Calculate the absolute difference between two integer-like values
|
||||
import math
|
||||
def calcdiff(point1, point2, data=None):
    """Return the absolute difference between two integer-like values.

    Both values are converted with ``int()`` before subtracting, so numeric
    strings such as ``"42"`` are accepted.

    Args:
        point1: First value (anything ``int()`` accepts).
        point2: Second value (anything ``int()`` accepts).
        data: Unused. Kept (and made optional) so that both two-argument and
            three-argument callers keep working.

    Returns:
        The non-negative integer ``|int(point1) - int(point2)|``.
    """
    # abs() replaces the manual if/else plus the extra betrag() call, which
    # was a no-op because the branch result was already non-negative.
    return abs(int(point1) - int(point2))
|
||||
|
||||
def calcdiff2d(point1, point2):
    """Return the Euclidean distance between two 2-D points.

    Args:
        point1: Sequence ``[x, y]``; every element must be ``int()``-convertible.
        point2: Sequence ``[x, y]``; every element must be ``int()``-convertible.

    Returns:
        The distance as a non-negative float.
    """
    first = [int(value) for value in point1]
    second = [int(value) for value in point2]
    delta_x = second[0] - first[0]
    # The y term must use index 1; the previous version reused index 0 and
    # therefore ignored the y coordinate entirely.
    delta_y = second[1] - first[1]
    # hypot() is never negative, so no absolute-value step is needed.
    return math.hypot(delta_x, delta_y)
|
||||
|
||||
|
||||
# Get the absolute value of a number ("Betrag" is German for absolute value)
def betrag(number):
    """Return the absolute value of ``number``.

    Non-negative input is returned unchanged, keeping its type. Negative
    input is negated and passed through ``int()``, so a negative float is
    truncated toward zero (``-2.5`` becomes ``2``).

    Args:
        number: An int or float.

    Returns:
        ``number`` itself if it is >= 0, otherwise ``int(-number)``.
    """
    if number < 0:
        # Negate directly instead of computing (-2 * number) / 2: the float
        # division in the old form lost precision for large integers.
        return int(-number)
    return number
|
||||
|
||||
|
||||
# Determine the highest int value in an array and return it as an int
|
||||
def findHighest(data):
|
||||
maximum = 0
|
||||
|
||||
@@ -50,3 +50,33 @@ def numGen(entries, cluster, int_lenght):
|
||||
dataArray.append(generateNumber(int_lenght - 1, clusterArray[cluster_decider]))
|
||||
shuffle(dataArray)
|
||||
return dataArray
|
||||
# Simple generator for test plzs (40-40-20 biased), returns 1D array of plzs
|
||||
def plzGenNS(entries):
    """Generate an unshuffled list of test postal codes (PLZ).

    The list is built in three consecutive index ranges: the first 40% of
    entries call ``generateNumber(5, 2)``, the next 40% call
    ``generateNumber(5, 6)``, and the remainder call ``generateNumber`` with
    a random second argument from 0 to 9.
    NOTE(review): the second argument is presumably the digit that biases
    the generated number towards a cluster; confirm against generateNumber.

    The result is deliberately NOT shuffled: entry i here is meant to pair
    with entry i of the age list, and shuffling would break that pairing
    (producing four clusters instead of the intended ones).

    Args:
        entries: Number of postal codes to generate.

    Returns:
        A list with ``int(entries)`` generated values, in generation order.
    """
    digits = 5
    generated = []
    for index in range(int(entries)):
        if index < round(entries * 0.4):
            bias = 2
        elif index < round(entries * 0.8):
            # The first branch already ruled out index < 40%.
            bias = 6
        else:
            bias = randint(0, 9)
        generated.append(generateNumber(digits, bias))
    return generated
|
||||
|
||||
|
||||
def ageGenNS(entries):
    """Generate an unshuffled list of two-digit test ages.

    The list is built in three consecutive index ranges: the first 40% of
    entries call ``generateNumber(2, 2)``, the next 40% call
    ``generateNumber(2, 5)``, and the remainder call ``generateNumber`` with
    a random second argument from 0 to 9.
    NOTE(review): the second argument is presumably the digit that biases
    the generated number towards a cluster; confirm against generateNumber.

    The result is left in generation order (no shuffle) so that entry i can
    be paired with entry i of another unshuffled generator's output.

    Args:
        entries: Number of ages to generate.

    Returns:
        A list with ``int(entries)`` generated values, in generation order.
    """
    digits = 2
    generated = []
    for index in range(int(entries)):
        if index < round(entries * 0.4):
            bias = 2
        elif index < round(entries * 0.8):
            # The first branch already ruled out index < 40%.
            bias = 5
        else:
            bias = randint(0, 9)
        generated.append(generateNumber(digits, bias))
    return generated
|
||||
|
||||
133
src/algorithms/kmeansMkI_2d.py
Normal file
133
src/algorithms/kmeansMkI_2d.py
Normal file
@@ -0,0 +1,133 @@
|
||||
#!/usr/bin/env python
|
||||
# title: kmeansMkI_2d.py
|
||||
# description: Our personal Python K-Means++ implementation
|
||||
# author: Tillmann Brendel, Conrad Großer
|
||||
# license: Pending
|
||||
# date: 04.06.2018
|
||||
# version: 1.5
|
||||
# usage: python pyscript.py
|
||||
# notes:
|
||||
# known_issues:
|
||||
# python_version: 3.x
|
||||
# ==============================================================================
|
||||
|
||||
# IMPORTS
|
||||
|
||||
# Importing the time for benchmarking purposes
|
||||
import time
|
||||
from datetime import date
|
||||
|
||||
# For random generation of numbers import randint
|
||||
from random import randint
|
||||
|
||||
# Importing library for multi core processing
|
||||
import multiprocessing
|
||||
|
||||
# Importing libraries for easy plotting
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
# Importing our own libraries: Datamining Library (dmlib) and Datamining Test (dmtest)
|
||||
import dmlib
|
||||
import dmtest
|
||||
|
||||
|
||||
|
||||
# CODE
|
||||
# Main function of the algorithm
|
||||
def kmeansmk1(xdata, ydata, clusters):
    """Run the 2-D k-means loop on paired x/y data and plot the result.

    Cluster centres are stored in module globals named ``cpoint_<i>``,
    because ``assignCluster`` and ``calcClusters`` read and update them
    there. The function prints the initial and final centres, then shows a
    scatter plot of the data with the final centres marked in red.

    Args:
        xdata: Sequence of x values (each ``int()``-convertible).
        ydata: Sequence of y values (each ``int()``-convertible).
        clusters: Number of clusters to fit.

    Returns:
        None.
    """
    module_state = globals()

    # Pick the initial centres from the data. randint() includes BOTH
    # bounds, so the upper bound must be len - 1; using len could index one
    # past the end and raise IndexError.
    for i in range(clusters):
        key = "cpoint_" + str(i)
        module_state[key] = [
            xdata[randint(0, len(xdata) - 1)],
            ydata[randint(0, len(ydata) - 1)],
        ]
        print("Initial cluster " + str(i + 1) + ": " + str(module_state[key]))

    # Largest value per axis, passed through to assignCluster.
    highpointx = dmlib.findHighest(xdata)
    highpointy = dmlib.findHighest(ydata)

    # Alternate assignment and centre recalculation until calcClusters
    # reports that the centres no longer move.
    done = 0
    runs = 0
    while done == 0:
        runs = runs + 1
        # assigned_points holds one cluster number per data item.
        assigned_points = assignCluster(xdata, ydata, clusters, highpointx, highpointy)
        done = calcClusters(xdata, ydata, assigned_points, clusters)

    for i in range(clusters):
        print("Endcluster " + str(i + 1) + " is calculated to be at " + str(module_state["cpoint_" + str(i)]) + " after " + str(runs) + " runs")

    for i in range(clusters):
        centre = module_state["cpoint_" + str(i)]
        plt.plot(centre[0], centre[1], 'ro')
    plt.scatter([int(x) for x in xdata], [int(y) for y in ydata], marker='x', s=7, color='k')
    plt.show()
|
||||
# Calculates the mean position of each cluster from the x/y data and the per-item cluster assignments
|
||||
def calcClusters(xdata, ydata, assigned_points, clusters):
    """Recompute every cluster centre and report whether all stayed put.

    Each centre is read from and written to the module global
    ``cpoint_<i>``; the previous value is also saved in ``oldcpoint_<i>``.
    The new centre is the rounded mean of the x and y values of the items
    assigned to that cluster.

    Args:
        xdata: Sequence of x values (each ``int()``-convertible).
        ydata: Sequence of y values (each ``int()``-convertible).
        assigned_points: Cluster number for each item, parallel to ``xdata``.
        clusters: Number of clusters.

    Returns:
        1 if no centre changed in this pass, otherwise 0.
    """
    module_state = globals()

    # The flag must be initialised ONCE, before the loop. Resetting it per
    # cluster let the last cluster alone decide convergence.
    cpointunchanged = 1

    for cluster in range(clusters):
        key = "cpoint_" + str(cluster)
        previous = module_state[key]
        module_state["oldcpoint_" + str(cluster)] = previous

        clustersumx = 0
        clustersumy = 0
        count = 0
        for item in range(len(xdata)):
            if assigned_points[item] == cluster:
                clustersumx += int(xdata[item])
                clustersumy += int(ydata[item])
                count += 1

        if count == 0:
            # No members: keep the old centre rather than dividing by zero.
            continue

        module_state[key] = [round(clustersumx / count), round(clustersumy / count)]

        # Any single moved centre means another iteration is needed.
        if previous != module_state[key]:
            cpointunchanged = 0

    return cpointunchanged
|
||||
|
||||
|
||||
def assignCluster(xdata, ydata, clusters, highpointx, highpointy):
    """Assign every data item to its nearest cluster centre.

    Centres are read from the module globals ``cpoint_<i>``; distances are
    computed with ``dmlib.calcdiff2d``. On a tie the lowest cluster number
    wins.

    Args:
        xdata: Sequence of x values.
        ydata: Sequence of y values, parallel to ``xdata``.
        clusters: Number of clusters.
        highpointx: Unused; kept so existing callers keep working.
        highpointy: Unused; kept so existing callers keep working.

    Returns:
        A list with one cluster number per item, in input order.
    """
    module_state = globals()
    data_assigned = []

    for item in range(len(xdata)):
        # Start each item from scratch with an infinite bound. Previously
        # the bound was a finite distance derived from the high points and
        # the chosen cluster was not reset, so an item at or beyond that
        # bound silently inherited the previous item's cluster.
        best_distance = float("inf")
        assigned_cluster = 0
        for cluster in range(clusters):
            distance = dmlib.calcdiff2d(module_state["cpoint_" + str(cluster)], [xdata[item], ydata[item]])
            if distance < best_distance:
                best_distance = distance
                assigned_cluster = cluster
        data_assigned.append(assigned_cluster)

    return data_assigned
|
||||
|
||||
|
||||
# Startup function for collecting the necessary input and running the algorithm
|
||||
def startup(xdata, ydata):
    """Ask for the cluster count, run the clustering and print the runtime.

    Args:
        xdata: Sequence of x values handed on to ``kmeansmk1``.
        ydata: Sequence of y values handed on to ``kmeansmk1``.

    Returns:
        None.
    """
    # The cluster count comes from the user; the prompt hints at 2.
    cluster_count = int(input("How many clusters are known? (hint: 2) "))

    # Only the algorithm itself is timed, not the wait for user input.
    started_at = time.time()
    kmeansmk1(xdata, ydata, cluster_count)
    elapsed = time.time() - started_at

    print(str(elapsed) + " seconds for execution")
|
||||
|
||||
|
||||
# Start the algorithm and generate test xdata
|
||||
# Build 1000 unshuffled postal-code / age samples; index i of xdata pairs
# with index i of ydata, so the two lists must stay in generation order.
xdata, ydata = dmtest.plzGenNS(1000), dmtest.ageGenNS(1000)

# Prompt for the cluster count and run the clustering on the samples.
startup(xdata, ydata)
|
||||
Reference in New Issue
Block a user