Compare commits

3 Commits

Author SHA1 Message Date
94ed193954 Update .gitignore 2019-04-25 12:45:05 +02:00
e2ad63f90f Code Rework 2019-04-25 12:43:37 +02:00
25fa068df9 kmeans ++ Update
- Added concept for kmeans ++
2018-06-02 23:31:43 +02:00
6 changed files with 276 additions and 255 deletions

1
.gitignore vendored
View File

@@ -1,2 +1,3 @@
testdata/ testdata/
__pycache__/ __pycache__/
.DS_Store

View File

@@ -1,40 +1,27 @@
# Calculate the difference between two points giving the indexes of these data entries # Calculate the difference between two points giving the indexes of these data entries
def calcdiff(point1, point2): def calcdiff(point1, point2):
if int(point2) > int(point1): if int(point2) > int(point1):
difference = int(point2) - int(point1) difference = int(point2) - int(point1)
else: else:
difference = int(point1) - int(point2) difference = int(point1) - int(point2)
# print("Datapoint: " + str(data[point1]) + " | Cluster: " + str(data[point2]) + " | Difference: " + str(difference)) return abs(difference)
return betrag(difference)
# Get the absolute value of a number and returns it as int
def betrag(number):
if number < 0:
number = int((-2 * number) / 2)
return number
# Determine the highest int value in an array and returns is as an int
def findHighest(data):
maximum = 0
for i in range(0, len(data)):
if int(data[i]) > maximum:
maximum = int(data[i])
return maximum
def pp_calcdiff(data, clusterpoint): def pp_calcdiff(data, clusterpoint):
max_diff = 0 max_diff = 0
new_cluster = 0 new_cluster = 0
for item in range(0,len(data)): for item in range(0, len(data)):
if calcdiff(data[item], clusterpoint) > max_diff: if calcdiff(data[item], clusterpoint) > max_diff:
max_diff = calcdiff(data[item], clusterpoint) max_diff = calcdiff(data[item], clusterpoint)
new_cluster = data[item] new_cluster = data[item]
return new_cluster return new_cluster
def pp_calcdiff_2(data, clusterpoint, clusterpoint_2): def pp_calcdiff_2(data, clusterpoint, clusterpoint_2):
max_diff = 0 max_diff = 0
new_cluster = 0 new_cluster = 0
for item in range(0,len(data)): for item in range(0, len(data)):
if calcdiff(data[item], clusterpoint) + calcdiff(data[item], clusterpoint_2) > max_diff: if calcdiff(data[item], clusterpoint) + calcdiff(data[item], clusterpoint_2) > max_diff:
max_diff = calcdiff(data[item], clusterpoint) max_diff = calcdiff(data[item], clusterpoint)
new_cluster = data[item] new_cluster = data[item]
return new_cluster return new_cluster

View File

@@ -1,34 +1,38 @@
# For random generation of numbers import randint # For random generation of numbers import randint
from random import randint, shuffle from random import randint, shuffle
# Simple generator for test plzs (40-40-20 biased), returns 1D array of plzs # Simple generator for test plzs (40-40-20 biased), returns 1D array of plzs
def plzGen(entries): def plzGen(entries):
dataArray = [] dataArray = []
plz_lenght = 5 plz_lenght = 5
for i in range(0, int(entries)): for i in range(0, int(entries)):
if i < round(entries * 0.4): if i < round(entries * 0.4):
plz = generateNumber(plz_lenght, 2) plz = generateNumber(plz_lenght, 2)
elif i >= round(entries * 0.4) and i < round(entries * 0.6): elif i >= round(entries * 0.4) and i < round(entries * 0.6):
plz = generateNumber(plz_lenght, 9) plz = generateNumber(plz_lenght, 9)
elif i >= round(entries * 0.6) and i < round(entries * 0.9): elif i >= round(entries * 0.6) and i < round(entries * 0.9):
plz = generateNumber(plz_lenght, 4) plz = generateNumber(plz_lenght, 4)
else: else:
plz = generateNumber(plz_lenght, randint(0,9)) plz = generateNumber(plz_lenght, randint(0, 9))
dataArray.append(plz) dataArray.append(plz)
shuffle(dataArray) shuffle(dataArray)
return dataArray return dataArray
# Function for generating the content of one single row randomly # Function for generating the content of one single row randomly
def generateNumber(numberLenght, startingNumber): def generateNumber(numberLenght, startingNumber):
number = str(startingNumber) number = str(startingNumber)
for length in range(0, numberLenght - 1): for length in range(0, numberLenght - 1):
number = number + str(randint(0,9)) number = number + str(randint(0, 9))
return number return number
# Function for writing data into a file (content = string, nameChunkStart and namePartStart are for better naming) # Function for writing data into a file (content = string, nameChunkStart and namePartStart are for better naming)
# /testdata/ folder has to be created at this point # /testdata/ folder has to be created at this point
def writeFile(content, nameChunkStart, namePartStart): def writeFile(content, nameChunkStart, namePartStart):
filenumber = int(nameChunkStart) + int(namePartStart) filenumber = int(nameChunkStart) + int(namePartStart)
file = open("testdata/file" + str(filenumber) + ".txt", "w") file = open("testdata/file" + str(filenumber) + ".txt", "w")
for w in range(0, len(content)): for w in range(0, len(content)):
file.write(content[w] + "\n") file.write(content[w] + "\n")

View File

@@ -1,19 +1,18 @@
#!/usr/bin/env python #!/usr/bin/env python
#title: kmeansMkI.py # title: kmeansMkI.py
#description: Our personal Python K-Means++ implementation # description: Our personal Python K-Means++ implementation
#author: Tillmann Brendel, Conrad Großer # author: Tillmann Brendel, Conrad Großer
#license: Pending # license: Pending
#date: 26.05.2018 # date: 26.05.2018
#version: 1.2 # version: 1.2
#usage: python pyscript.py # usage: python pyscript.py
#notes: # notes:
#dependencies: matplotlib # dependencies: matplotlib
#known_issues: # known_issues:
#python_version: 3.x # python_version: 3.x
#============================================================================== # ==============================================================================
# IMPORTS # IMPORTS
# Importing the time for benchmarking purposes # Importing the time for benchmarking purposes
import time import time
from datetime import date from datetime import date
@@ -28,111 +27,120 @@ import matplotlib.pyplot as plt
import dmlib import dmlib
import dmtest import dmtest
# CODE
# Main function of the algorithm # Main function of the algorithm
def kmeansmk1(data, clusters): def kmeansmk1(data, clusters):
# Defining cluster points globals()["cpoint_0"] = data[randint(0, len(data))]
for i in range(0, clusters): globals()["cpoint_1"] = dmlib.pp_calcdiff(data, globals()["cpoint_0"])
globals()["cpoint_" + str(i)] = data[randint(0, len(data))]
print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))
# Get max value in the data array print("Initial cluster 1: " + str(globals()["cpoint_0"]))
highPoint = dmlib.findHighest(data) print("Initial cluster 2: " + str(globals()["cpoint_1"]))
# Define variables for running the algorithm (runs is just for benchmarking!) # Defining cluster points
done = 0 for i in range(2, clusters):
runs = 0 globals()["cpoint_" + str(i)] = dmlib.pp_calcdiff_2(data, globals()["cpoint_" + str(i - 1)], globals()["cpoint_" + str(i - 2)])
print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))
# As long as calcClusters returns done it will rearrange the clusters and assign the data to the clusters # Get max value in the data array
while done == 0: highPoint = max(data)
runs = runs + 1
new_data = assignCluster(data, highPoint, clusters)
done = calcClusters(new_data, clusters)
# Printing final clusters # Define variables for running the algorithm (runs is just for benchmarking!)
for i in range(0, clusters): done, runs = False, 0
print("Endcluster " + str(i + 1) + " is calculated to be at " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")
# Getting artificial array for visualizing 1D data in a 2D graphic of the size of the original data # As long as calcClusters returns done it will rearrange the clusters and assign the data to the clusters
anew = [] while not done:
inew = 0 runs += 1
while inew < len(data): new_data = assignCluster(data, highPoint, clusters)
anew.append(inew) done = calcClusters(new_data, clusters)
inew = inew + 1
# Drawing found clusters as lines # Printing final clusters
for i in range(0, clusters): for i in range(0, clusters):
plt.axvline(x=int(globals()["cpoint_" + str(i)]), color='r') print("Endcluster " + str(i + 1) + " is calculated to be at " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")
# Showing graph # Getting artificial array for visualizing 1D data in a 2D graphic of the size of the original data
plt.scatter([int(x) for x in data], anew, marker='x', s=7, color='k') anew, inew = [], 0
plt.show()
while inew < len(data):
anew.append(inew)
inew += 1
# Drawing found clusters as lines
for i in range(0, clusters):
plt.axvline(x=int(globals()["cpoint_" + str(i)]), color='r')
# Showing graph
plt.scatter([int(x) for x in data], anew, marker='x', s=7, color='k')
plt.show()
return 0
return 0
# Calculates middle values for each cluster, takes 2D array (item, assigned_cluster) # Calculates middle values for each cluster, takes 2D array (item, assigned_cluster)
def calcClusters(data, clusters): def calcClusters(data, clusters):
changed = 0 changed = False
for cluster in range(0, clusters): for cluster in range(0, clusters):
# Getting current cluster and saving it in temporary variable # Getting current cluster and saving it in temporary variable
prev_cluster = globals()["cpoint_" + str(cluster)] prev_cluster = globals()["cpoint_" + str(cluster)]
# Sum of the cluster to calculate average difference between cluster center and data points # Sum of the cluster to calculate average difference between cluster center and data points
clustersum = 0 clustersum = 0
item_count = 0 item_count = 0
for item in range(0, len(data[0])): for item in range(0, len(data[0])):
if data[1][item] == globals()["cpoint_" + str(cluster)]: if data[1][item] == globals()["cpoint_" + str(cluster)]:
clustersum = clustersum + int(data[0][item]) clustersum = clustersum + int(data[0][item])
item_count = item_count + 1 item_count = item_count + 1
globals()["cpoint_" + str(cluster)] = round(clustersum / item_count) globals()["cpoint_" + str(cluster)] = round(clustersum / item_count)
# Checking if previous clusterpoint is equal to the one just calculated # Checking if previous clusterpoint is equal to the one just calculated
if prev_cluster == globals()["cpoint_" + str(cluster)]: if prev_cluster == globals()["cpoint_" + str(cluster)]:
changed = 1 changed = True
return changed
return changed
def assignCluster(data, highPoint, clusters): def assignCluster(data, highPoint, clusters):
# Create a new data array for working # Create a new data array for working
new_data = [] new_data = [data]
new_data.append(data)
# Create new array for assigned clusters of each value # Create new array for assigned clusters of each value
data_assigned = [] data_assigned = []
# For each item in data find the minimal difference to a cluster and write it in the new data array in the second place (new_data[item][cluster_index]) # For each item in data find the minimal difference to a cluster and write it in the new data array in the second place (new_data[item][cluster_index])
for item in range(0, len(new_data[0])): for item in data:
# Set the minimal cluster difference to the highest difference in the list to ease comparison # Set the minimal cluster difference to the highest difference in the list to ease comparison
min_cluster = highPoint min_cluster = highPoint
# Check the difference between the point (item) and each cluster and set min_cluster to the smallest difference # Check the difference between the point (item) and each cluster and set min_cluster to the smallest difference
for cluster in range(0, clusters): for cluster in range(0, clusters):
if min_cluster > dmlib.calcdiff(data[item], globals()["cpoint_" + str(cluster)]): if int(min_cluster) > dmlib.calcdiff(item, globals()["cpoint_" + str(cluster)]):
min_cluster = dmlib.calcdiff(data[item], globals()["cpoint_" + str(cluster)]) min_cluster = dmlib.calcdiff(item, globals()["cpoint_" + str(cluster)])
assinged_cluster = globals()["cpoint_" + str(cluster)] assinged_cluster = globals()["cpoint_" + str(cluster)]
# Assign the minimal difference cluster to the data
data_assigned.append(assinged_cluster) # Assign the minimal difference cluster to the data
# Add the assigned values list to the new_data array data_assigned.append(assinged_cluster)
new_data.append(data_assigned) # Add the assigned values list to the new_data array
new_data.append(data_assigned)
return new_data
return new_data
# Startup function for collecting necessary data # Startup function for collecting necessary data
def startup(data): def startup(data):
# Using two clusters for testing # Using two clusters for testing
clusters = int(input("How many clusters are known? ")) clusters = int(input("How many clusters are known? "))
# cores = input("How many cores should be used? ") # cores = input("How many cores should be used? ")
# path = input("Where is the data? ") or in this case data # path = input("Where is the data? ") or in this case data
# For benchmarking starting the timer now
start_time = time.time()
# Firing up the engines! # For benchmarking starting the timer now
kmeansmk1(data, clusters) start_time = time.time()
# Firing up the engines!
kmeansmk1(data, clusters)
# Stopping benchmark
seconds = time.time() - start_time
print(str(seconds) + " seconds for execution")
# Stopping benchmark
seconds = time.time() - start_time
print(str(seconds) + " seconds for execution")
# Start the algorithm and generate test data # Start the algorithm and generate test data
data = dmtest.plzGen(10000) data = dmtest.plzGen(10000)

View File

@@ -10,69 +10,79 @@ from datetime import date
# Importing for multi core processing # Importing for multi core processing
import multiprocessing import multiprocessing
# randomI function which creates each file # randomI function which creates each file
def randomI(units, rows, rowLength, partstart): def randomI(units, rows, rowLength, partstart):
for setcounter in range(0, units): for setcounter in range(0, units):
writeFile(generateFile(rows, rowLength), setcounter, partstart) writeFile(generateFile(rows, rowLength), setcounter, partstart)
return True
# Function for generating the content of one single file # Function for generating the content of one single file
def generateFile(rows, rowLength): def generateFile(rows, rowLength):
content = [] content = []
for y in range(0, rows): for entry in rows:
content.append(generateRow(rowLength)) content.append(generateRow(rowLength))
return content return content
# Function for generating the content of one single row randomly # Function for generating the content of one single row randomly
def generateRow(rowLength): def generateRow(rowLength):
row = "" row = ""
for z in range(0, rowLength): for z in range(0, rowLength):
row = row + str(randint(0, 9)) row += str(randint(0, 9))
return row return row
# Function for writing data into a file # Function for writing data into a file
def writeFile(content, setcounter, partstart): def writeFile(content, setcounter, partstart):
filenumber = int(setcounter) + int(partstart) filenumber = int(setcounter) + int(partstart)
file = open("testdata/file" + str(filenumber) + ".txt", "w") file = open("testdata/file" + str(filenumber) + ".txt", "w")
for w in range(0, len(content)): for line in content:
file.write(content[w] + "\n") file.write(line + "\n")
return True
if __name__ == '__main__': if __name__ == '__main__':
# Getting the user input # Getting the user input
print("Hello World") print("Hello World")
units = int(input("How many units would you like to generate? ")) units = int(input("How many units would you like to generate? "))
rows = int(input("How many rows should each unit have? ")) rows = int(input("How many rows should each unit have? "))
rowLength = int(input("How long should each row be? ")) rowLength = int(input("How long should each row be? "))
cores = int(input("How many cores do you want to use? ")) cores = int(input("How many cores do you want to use? "))
# Splitting up the units # Splitting up the units
count = int(0) count = 0
partsize = units / cores partsize = units / cores
# For benchmarking starting the timer now # For benchmarking starting the timer now
start_time = time.time() start_time = time.time()
# Initialize and prepare cores for process # Initialize and prepare cores for process
while count < cores: while count < cores:
partstart = partsize * count partstart = partsize * count
globals()["p" + str(count)] = multiprocessing.Process(target=randomI, args=(int(partsize), rows, rowLength, partstart)) globals()["p" + str(count)] = multiprocessing.Process(
count = count + 1 target=randomI,
args=(int(partsize), rows, rowLength, partstart)
)
count += 1
# Starting each core # Starting each core
count = int(0) count = 0
while count < cores: while count < cores:
globals()["p" + str(count)].start() globals()["p" + str(count)].start()
print("Core " + str(count) + " started.") print("Core " + str(count) + " started.")
count = count + 1 count += 1
print("Working...") print("Working...")
# Joining each core for the process # Joining each core for the process
count = int(0) count = 0
while count < cores: while count < cores:
globals()["p" + str(count)].join() globals()["p" + str(count)].join()
count = count + 1 count += 1
# Finishing up the process # Finishing up the process
sec = time.time() - start_time sec = time.time() - start_time
print("Data is generated. Have fun!") print("Data is generated. Have fun!")
print("randomI took " + str(sec) + " seconds for execution.") print("randomI took " + str(sec) + " seconds for execution.")

View File

@@ -1,15 +1,15 @@
#!/usr/bin/env python #!/usr/bin/env python
#title: randomI2.1.py # title: randomI2.1.py
#description: Personal # description: Personal
#author: Tillmann Brendel, Conrad Großer # author: Tillmann Brendel, Conrad Großer
#license: Pending # license: Pending
#date: 26.05.2018 # date: 26.05.2018
#version: 1.0 # version: 1.0
#usage: python pyscript.py # usage: python pyscript.py
#notes: # notes:
#known_issues: # known_issues:
#python_version: 3.x # python_version: 3.x
#============================================================================== # ==============================================================================
# For random generation of numbers import randint # For random generation of numbers import randint
from random import randint from random import randint
@@ -21,76 +21,87 @@ from datetime import date
# Importing for multi core processing # Importing for multi core processing
import multiprocessing import multiprocessing
# randomI function which creates each file # randomI function which creates each file
def randomI(units, rows, rowLength, partstart, cluster): def randomI(units, rows, rowLength, partstart, cluster):
for setcounter in range(0, units): for setcounter in range(0, units):
writeFile(generateFile(rows, rowLength, cluster), setcounter, partstart) writeFile(generateFile(rows, rowLength, cluster), setcounter, partstart)
return True
# Function for generating the content of one single file # Function for generating the content of one single file
def generateFile(rows, rowLength, cluster): def generateFile(rows, rowLength, cluster):
content = [] content = []
for y in range(0, rows): for entry in rows:
if y == 0: if entry == 0:
if 1 == randint(1, cluster): if randint(1, cluster) == 1:
content.append(generate09()) content.append(generate09())
else: else:
content.append(generatePLZ()) content.append(generatePLZ())
else: else:
content.append(generateRow(rowLength)) content.append(generateRow(rowLength))
return content return content
# Function for generating the content of one single row randomly # Function for generating the content of one single row randomly
def generateRow(rowLength): def generateRow(rowLength):
row = "" row = ''
for z in range(0, rowLength): for z in range(0, rowLength):
row = row + str(randint(0, 9)) row += str(randint(0, 9))
return row return row
# Function for writing data into a file (content = string, setcount and partstart are for better naming) # Function for writing data into a file (content = string, setcount and partstart are for better naming)
def writeFile(content, setcounter, partstart): def writeFile(content, setcounter, partstart):
filenumber = int(setcounter) + int(partstart) filenumber = int(setcounter) + int(partstart)
file = open("testdata/file" + str(filenumber) + ".txt", "w") file = open('testdata/file' + str(filenumber) + '.txt', 'w')
for w in range(0, len(content)):
file.write(content[w] + "\n") for line in content:
file.write(line + '\n')
return True
if __name__ == '__main__': if __name__ == '__main__':
# Getting the user input # Getting the user input
print("Hello World") print('Hello World')
units = int(input("How many units would you like to generate? ")) units = int(input('How many units would you like to generate? '))
rows = int(input("How many rows should each unit have? ")) rows = int(input('How many rows should each unit have? '))
rowLength = int(input("How long should each row be? ")) rowLength = int(input('How long should each row be? '))
cores = int(input("How many cores do you want to use? ")) cores = int(input('How many cores do you want to use? '))
cluster = int(input("What fraction of postal codes should be in the 09xxx cluster? 1/")) cluster = int(input('What fraction of postal codes should be in the 09xxx cluster? 1/'))
# Splitting up the units # Splitting up the units
count = int(0) count = 0
partsize = units / cores partsize = units / cores
# For benchmarking starting the timer now # For benchmarking starting the timer now
start_time = time.time() start_time = time.time()
# Initialize and prepare cores for process # Initialize and prepare cores for process
while count < cores: while count < cores:
partstart = partsize * count partstart = partsize * count
globals()["p" + str(count)] = multiprocessing.Process(target=randomI, args=(int(partsize), rows, rowLength, partstart, cluster)) globals()['p' + str(count)] = multiprocessing.Process(
count = count + 1 target=randomI,
args=(int(partsize), rows, rowLength, partstart, cluster)
)
count += 1
# Starting each core # Starting each core
count = int(0) count = int(0)
while count < cores: while count < cores:
globals()["p" + str(count)].start() globals()['p' + str(count)].start()
print("Core " + str(count) + " started.") print('Core ' + str(count) + ' started.')
count = count + 1 count += 1
print("Working...") print('Working...')
# Joining each core for the process # Joining each core for the process
count = int(0) count = 0
while count < cores: while count < cores:
globals()["p" + str(count)].join() globals()['p' + str(count)].join()
count = count + 1 count += 1
# Finishing up the process # Finishing up the process
sec = time.time() - start_time sec = time.time() - start_time
print("Data is generated. Have fun!") print('Data is generated. Have fun!')
print("randomI took " + str(sec) + " seconds for execution.") print('randomI took ' + str(sec) + ' seconds for execution.')