A supervised learning model.
Find the nearest K neighbors of a data point, assign according to majority vote.
The value K needs to be fine-tuned
Properties
- Needs a distance measure between two points $x_i=(x_i^{(1)}, x_i^{(2)},…, x_i^{(n)})$ and $x_j=(x_j^{(1)}, x_j^{(2)},…, x_j^{(n)})$ \(L_p(x_i, x_j) = (\sum^n_{l=1}|x^{(l)}_i - x^{(l)}_j|^{p})^{\frac{1}{p}}\)
- A non-parametric model, i.e. need to keep the entire dataset in order to make predictions.K-Dimensional Trees (K-d Trees)
A binary tree. Each node is a partition of the k-dimensional space.
Motivation
For each new point to be predicted, we first need to find its K nearest neighbors. This cannot be done efficiently without using some smart data structures.
Implementation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
from math import sqrt
class KDTreeNode:
	def __init__(self, decision_dim, value, left=None, right=None):
		self.value = value
		self.left = left
		self.right = right
		self.decision_dim = decision_dim
	def printSelf(self):
		print(self)
	def __repr__(self):
		str_left = str(self.left) if self.left is not None else ""
		str_right = str(self.right) if self.right is not None else ""
		return str_left + str(self.value) + str_right
class KDTree:
	def __init__(self, dim):
		self.root = None
		self.dim = dim
	def insert(self, point):
		assert len(point) == self.dim
		if self.root is None:
			self.root = KDTreeNode(0, value=point)
		else:
			node = self.root
			while True:
				dim = node.decision_dim
				# The point lies on the decision surface
				# if point[dim] == node.values[0][dim]:
				#	node.values.append(point)
				#	return
				# The point should be in its left subspace 
				if point[dim] <= node.value[dim]:
					if node.left is None:
						node.left = KDTreeNode((dim + 1) % self.dim, point)
						return
					node = node.left
				# The point should be in its right subspace
				else:
					if node.right is None:
						node.right = KDTreeNode((dim + 1) % self.dim, point)
						return
					node = node.right
	def getClosestN(self, target, n=1):
		"""
			Reference: https://www.colorado.edu/amath/sites/default/files/attached-files/k-d_trees_and_knn_searches.pdf
		"""
		if self.root is None:
			return None
		closestN = []	# [(point, distance)], sorted by distance
		stack = []		# [(node, left_used, right_used)]
		node = self.root
		nth_closest = float('inf') 	# Current nth closest distance
		# Locate the finest subspace (i.e. a leaf node), and record the path along the way
		while True:
			dim = node.decision_dim
			if target[dim] <= node.value[dim]:
				stack.append([node, True, False])
				if node.left is None:
					break
				node = node.left
			else:
				stack.append([node, False, True])
				if node.right is None:
					break
				
				node = node.right
		# Search
		while stack:
			node, left_used, right_used = stack[-1]
			print(node.value, left_used, right_used)
			dim = node.decision_dim
			# Ignore the node if already used, or too far away to contain any closer point. 
			if ((left_used or node.left is None) and (right_used or node.right is None)) or \
				abs(node.value[dim] - target[dim]) > nth_closest:
				updateMinimum(closestN, n, node.value, calculateDistance(target, node.value))
				nth_closest = closestN[-1][1]
				stack.pop()
				continue
			# Check left subspace
			if not left_used and node.left is not None:
				# Set left as used
				stack[-1][1] = True
				# Append child node and check its distance
				stack.append([node.left, False, False])
			# Check right subspace
			elif not right_used and node.right is not None:
				# Set right as used
				stack[-1][2] = True
				# Append child node and check its distance
				stack.append([node.right, False, False])
			else:
				stack.pop()
		return closestN
	def printSelf(self):
		if self.root is not None:
			self.root.printSelf()
	def __str__(self):
		return str(self.root)
def updateMinimum(array, n, point, distance):
	# Assume that n is small enough so that we don't have to bother about performance...
	# Although using a heap would be ideal...
	length = len(array)
	assert length <= n
	array.append((point, distance))
	array.sort(key=lambda p: p[1])
	if len(array) > n:
		array.pop()
def calculateDistance(x, y):
	assert len(x) == len(y)
	square_dist = 0
	for v1, v2 in zip(x, y):
		square_dist += (v1 - v2) ** 2
	return sqrt(square_dist)
print("Checking insertion")
points = [(7,2),(5,4),(9,6),(2,3),(4,7),(8,1)]
# points = reversed([(2,3,1), (5,4,2), (9,6,3), (4,7,4), (8,1,5), (7,2,6)])
tree = KDTree(2)
for p in points:
	tree.insert(p)
print(tree)
print("Checking query algorithm")
points = [(3, 6), (17, 15), (13, 15), (6, 12), (9, 1), (2, 7), (10, 19)]
tree = KDTree(2)
for p in points:
	tree.insert(p)
print(tree.getClosestN((4,8), n=3))
