Added numGen generator

- Removed unstable randomi version - Added numGen generator - Detected bug
2018-06-03 00:41:18 +02:00
parent 4e3ceac4a9
commit aa43c93ae5
3 changed files with 23 additions and 99 deletions
--- a/src/algorithms/dmtest.py
+++ b/src/algorithms/dmtest.py
@@ -31,4 +31,22 @@ def writeFile(content, nameChunkStart, namePartStart):
 	filenumber = int(nameChunkStart) + int(namePartStart)
 	file = open("testdata/file" + str(filenumber) + ".txt", "w")
 	for w in range(0, len(content)):
-		file.write(content[w] + "\n")
+		file.write(content[w] + "\n")
+
+# Function for generating 'entries' numbers, each 'int_lenght' long, spread over 'cluster' clusters
+def numGen(entries, cluster, int_lenght):
+	dataArray = []  # generated entries; this list is what gets returned
+	clusterArray = []  # one random two-digit value per cluster
+
+	for cluster_num in range(0, cluster):
+		clusterArray.append(randint(10,99))  # inclusive range 10..99
+
+	for item in range(0, entries):
+		decider = randint(0, 2)  # 0, 1 or 2: one draw in three takes the non-cluster branch
+		if decider == 2:
+			dataArray.append(generateNumber(int_lenght, randint(1,9)))  # non-cluster entry; second argument is a random value 1..9
+		else:
+			cluster_decider = randint(0, cluster - 1)  # random index into clusterArray; NOTE(review): randint raises ValueError when cluster is 0
+			dataArray.append(generateNumber(int_lenght - 1, clusterArray[cluster_decider]))  # NOTE(review): presumably the two-digit cluster value is a leading prefix, hence int_lenght - 1; confirm against generateNumber
+	shuffle(dataArray)  # reorder the entries (presumably random.shuffle; the import is not visible here)
+	return dataArray
--- a/src/algorithms/kmeansMkI.py
+++ b/src/algorithms/kmeansMkI.py
@@ -8,7 +8,7 @@
 #usage:				python pyscript.py
 #notes:
 #dependencies:		mathplotlib
-#known_issues:
+#known_issues:		When clusters are 'thin' or noise is too strong --> inaccurate
 #python_version:	3.x
 #==============================================================================

@@ -135,6 +135,8 @@ def startup(data):
 	print(str(seconds) + " seconds for execution")

 # Start the algorithm and generate test data
-data = dmtest.plzGen(10000)
+# data = dmtest.plzGen(10000)
+# data = dmtest.numGen(10000, 3, 5)

+data = dmtest.numGen(10000, 8, 7)
 startup(data)
--- a/src/data_generators/randomi2.1.py
+++ b/src/data_generators/randomi2.1.py
@@ -1,96 +0,0 @@
-#!/usr/bin/env python
-#title:				randomI2.1.py
-#description:		Personal 
-#author:			Tillmann Brendel, Conrad Großer
-#license:			Pending
-#date:				26.05.2018
-#version:			1.0
-#usage:				python pyscript.py
-#notes:
-#known_issues:
-#python_version:	3.x
-#==============================================================================
-
-# For random generation of numbers import randint
-from random import randint
-
-# Importing the time for benchmarking purposes
-import time
-from datetime import date
-
-# Importing for multi core processing
-import multiprocessing
-
-# randomI function which creates each file
-def randomI(units, rows, rowLength, partstart, cluster):
-	for setcounter in range(0, units):
-		writeFile(generateFile(rows, rowLength, cluster), setcounter, partstart)
-
-# Function for generating the content of one single file
-def generateFile(rows, rowLength, cluster):
-	content = []
-	for y in range(0, rows):
-		if y == 0:
-			if 1 == randint(1, cluster):
-				content.append(generate09())
-			else:
-				content.append(generatePLZ())
-		else:
-			content.append(generateRow(rowLength))
-	return content
-
-# Function for generating the content of one single row randomly
-def generateRow(rowLength):
-	row = ""
-	for z in range(0, rowLength):
-		row = row + str(randint(0, 9))
-	return row
-
-# Function for writing data into a file (content = string, setcount and partstart are for better naming)
-def writeFile(content, setcounter, partstart):
-	filenumber = int(setcounter) + int(partstart)
-	file = open("testdata/file" + str(filenumber) + ".txt", "w")
-	for w in range(0, len(content)):
-		file.write(content[w] + "\n")
-
-if __name__ == '__main__':
-	# Getting the user input
-	print("Hello World")
-	units = int(input("How many units would you like to generate? "))
-	rows = int(input("How many rows should each unit have? "))
-	rowLength = int(input("How long should each row be? "))
-	cores = int(input("How many cores do you want to use? "))
-	cluster = int(input("What fraction of postal codes should be in the 09xxx cluster? 1/"))
-
-    # Splitting up the units
-	count = int(0)
-	partsize = units / cores
-
-	# For benchmarking starting the timer now
-	start_time = time.time()
-
-	# Initialize and prepare cores for process
-	while count < cores:
-		partstart = partsize * count
-		globals()["p" + str(count)] = multiprocessing.Process(target=randomI, args=(int(partsize), rows, rowLength, partstart, cluster))
-		count = count + 1
-
-	# Starting each core
-	count = int(0)
-	while count < cores:
-		globals()["p" + str(count)].start()
-		print("Core " + str(count) + " started.")
-		count = count + 1
-
-	print("Working...")
-
-	# Joining each core for the process
-	count = int(0)
-	while count < cores:
-		globals()["p" + str(count)].join()
-		count = count + 1
-
-	# Finishing up the process
-	sec = time.time() - start_time
-	print("Data is generated. Have fun!")
-	print("randomI took " + str(sec) + " seconds for execution.")