From aa43c93ae5ed6da69deef177a44cc47f30524878 Mon Sep 17 00:00:00 2001
From: Conrad <grosserconrad@gmail.com>
Date: Sun, 3 Jun 2018 00:41:18 +0200
Subject: [PATCH] Added numGen generator

- Removed unstable randomi version
- Added numGen generator
- Detected bug
---
 src/algorithms/dmtest.py          | 20 ++++++-
 src/algorithms/kmeansMkI.py       |  6 +-
 src/data_generators/randomi2.1.py | 96 -------------------------------
 3 files changed, 23 insertions(+), 99 deletions(-)
 delete mode 100644 src/data_generators/randomi2.1.py

diff --git a/src/algorithms/dmtest.py b/src/algorithms/dmtest.py
index d13dc48..900b69a 100644
--- a/src/algorithms/dmtest.py
+++ b/src/algorithms/dmtest.py
@@ -31,4 +31,22 @@ def writeFile(content, nameChunkStart, namePartStart):
 	filenumber = int(nameChunkStart) + int(namePartStart)
 	file = open("testdata/file" + str(filenumber) + ".txt", "w")
 	for w in range(0, len(content)):
-		file.write(content[w] + "\n")
\ No newline at end of file
+		file.write(content[w] + "\n")
+
+# Function for generating 'entries' x 'int_lenght'-long numbers in 'cluster' clusters
+def numGen(entries, cluster, int_lenght):
+	# Returns a shuffled list holding 'entries' results of generateNumber().
+	# One random two-digit value (10..99) is drawn per cluster.
+	cluster_values = [randint(10, 99) for _ in range(cluster)]
+
+	numbers = []
+	for _ in range(entries):
+		if randint(0, 2) == 2:
+			# Draw from 0..2 came up 2: pass the full length and a random single digit (1..9)
+			number = generateNumber(int_lenght, randint(1, 9))
+		else:
+			# Otherwise: pass length - 1 and a randomly picked cluster value
+			number = generateNumber(int_lenght - 1, cluster_values[randint(0, cluster - 1)])
+		numbers.append(number)
+	shuffle(numbers)
+	return numbers
diff --git a/src/algorithms/kmeansMkI.py b/src/algorithms/kmeansMkI.py
index f15b222..89a42bc 100644
--- a/src/algorithms/kmeansMkI.py
+++ b/src/algorithms/kmeansMkI.py
@@ -8,7 +8,7 @@
 #usage:				python pyscript.py
 #notes:
 #dependencies:		mathplotlib
-#known_issues:
+#known_issues:		When clusters are 'thin' or noise is too strong --> inaccurate
 #python_version:	3.x
 #==============================================================================
 
@@ -135,6 +135,8 @@ def startup(data):
 	print(str(seconds) + " seconds for execution")
 
 # Start the algorithm and generate test data
-data = dmtest.plzGen(10000)
+# data = dmtest.plzGen(10000)
+# data = dmtest.numGen(10000, 3, 5)
 
+data = dmtest.numGen(10000, 8, 7)
 startup(data)
diff --git a/src/data_generators/randomi2.1.py b/src/data_generators/randomi2.1.py
deleted file mode 100644
index b035f58..0000000
--- a/src/data_generators/randomi2.1.py
+++ /dev/null
@@ -1,96 +0,0 @@
-#!/usr/bin/env python
-#title:				randomI2.1.py
-#description:		Personal 
-#author:			Tillmann Brendel, Conrad Großer
-#license:			Pending
-#date:				26.05.2018
-#version:			1.0
-#usage:				python pyscript.py
-#notes:
-#known_issues:
-#python_version:	3.x
-#==============================================================================
-
-# For random generation of numbers import randint
-from random import randint
-
-# Importing the time for benchmarking purposes
-import time
-from datetime import date
-
-# Importing for multi core processing
-import multiprocessing
-
-# randomI function which creates each file
-def randomI(units, rows, rowLength, partstart, cluster):
-	for setcounter in range(0, units):
-		writeFile(generateFile(rows, rowLength, cluster), setcounter, partstart)
-
-# Function for generating the content of one single file
-def generateFile(rows, rowLength, cluster):
-	content = []
-	for y in range(0, rows):
-		if y == 0:
-			if 1 == randint(1, cluster):
-				content.append(generate09())
-			else:
-				content.append(generatePLZ())
-		else:
-			content.append(generateRow(rowLength))
-	return content
-
-# Function for generating the content of one single row randomly
-def generateRow(rowLength):
-	row = ""
-	for z in range(0, rowLength):
-		row = row + str(randint(0, 9))
-	return row
-
-# Function for writing data into a file (content = string, setcount and partstart are for better naming)
-def writeFile(content, setcounter, partstart):
-	filenumber = int(setcounter) + int(partstart)
-	file = open("testdata/file" + str(filenumber) + ".txt", "w")
-	for w in range(0, len(content)):
-		file.write(content[w] + "\n")
-
-if __name__ == '__main__':
-	# Getting the user input
-	print("Hello World")
-	units = int(input("How many units would you like to generate? "))
-	rows = int(input("How many rows should each unit have? "))
-	rowLength = int(input("How long should each row be? "))
-	cores = int(input("How many cores do you want to use? "))
-	cluster = int(input("What fraction of postal codes should be in the 09xxx cluster? 1/"))
-
-    # Splitting up the units
-	count = int(0)
-	partsize = units / cores
-
-	# For benchmarking starting the timer now
-	start_time = time.time()
-
-	# Initialize and prepare cores for process
-	while count < cores:
-		partstart = partsize * count
-		globals()["p" + str(count)] = multiprocessing.Process(target=randomI, args=(int(partsize), rows, rowLength, partstart, cluster))
-		count = count + 1
-
-	# Starting each core
-	count = int(0)
-	while count < cores:
-		globals()["p" + str(count)].start()
-		print("Core " + str(count) + " started.")
-		count = count + 1
-
-	print("Working...")
-
-	# Joining each core for the process
-	count = int(0)
-	while count < cores:
-		globals()["p" + str(count)].join()
-		count = count + 1
-
-	# Finishing up the process
-	sec = time.time() - start_time
-	print("Data is generated. Have fun!")
-	print("randomI took " + str(sec) + " seconds for execution.")