1.关于KNN算法代码里的一些问题
1.1关于numpy的sum用法
Check out the documentation for numpy.sum, paying particular attention to the axis parameter.
To sum over columns:
>>> import numpy as np
>>> a = np.arange(12).reshape(4,3)
>>> a.sum(axis=0)
array([18, 22, 26])
Or, to sum over rows:
>>> a.sum(axis=1)
array([ 3, 12, 21, 30])
1.2关于numpy的tile
>>> a = np.array([0, 1, 2])
>>> b = np.tile(a, (2, 1))
>>> b
array([[0, 1, 2],
       [0, 1, 2]])
1.3关于numpy的argsort
One dimensional array:
>>> x = np.array([3, 1, 2])
>>> np.argsort(x)
array([1, 2, 0])
1.4 关于matplotlib中figure问题
MATLAB, and pyplot, have the concept of the current figure and the current axes.
All plotting commands apply to the current axes.
The figure() command here is optional because figure(1) will be created by default,
just as a subplot(111) will be created by default if you don’t manually specify
any axes. The subplot() command specifies numrows, numcols, fignum where fignum
ranges from 1 to numrows*numcols.
2.KNN算法代码
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Args:
        inX: The point to classify; one value per column of ``dataSet``.
        dataSet: 2-D array with one known sample per row.
        labels: Sequence of labels, ``labels[i]`` belonging to row ``i``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that occurs most often among the ``k`` nearest rows.

    Raises:
        IndexError: If ``k`` is 0 (nobody votes) or larger than the number
            of rows in ``dataSet``.
    """
    # calculate the Euclidean distance from inX to every row
    dataSetSize = dataSet.shape[0]
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5
    # row indices ordered from the closest row to the farthest
    sortedDistIndicies = distances.argsort()

    # let the k nearest rows vote with their labels
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # rank the labels by vote count, most frequent first;
    # items() (unlike iteritems()) exists on both Python 2 and Python 3
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    return sortedClassCount[0][0]
3.从文件中读取数据的代码(这种处理文件的流程在很多代码处理文件时都会用到)
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line is stripped and split on tabs. The first three
    fields are parsed as floats and become one row of the matrix; the last
    field is parsed as an int and becomes that row's label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: an array with one row of
        three floats per data line, and the list of int labels in the same
        order.

    Raises:
        ValueError: If a feature field is not a float or the last field is
            not an int.
    """
    # the with-block closes the file even if parsing fails later
    with open(filename) as fr:
        arrayOfLines = [line.strip() for line in fr]
    # blank lines (e.g. a trailing newline) carry no sample, so skip them
    arrayOfLines = [line for line in arrayOfLines if line]

    # zeros() takes the shape as ONE tuple; a second positional argument
    # would be interpreted as the dtype
    returnMat = zeros((len(arrayOfLines), 3))
    classLabelVector = []
    for index, line in enumerate(arrayOfLines):
        listFromLine = line.split('\t')
        returnMat[index, :] = [float(field) for field in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
4.完整的代码
from numpy import *
import matplotlib
import matplotlib.pyplot as plt
import operator
def createDataSet():
    """Return a tiny hand-made training set for trying out the classifier.

    Returns:
        A tuple ``(group, labels)``: a 4x2 array of sample points and the
        list of their class labels, in matching order.
    """
    sample_labels = ['A', 'A', 'B', 'B']
    sample_points = [
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ]
    return array(sample_points), sample_labels
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    The winning label is also printed before it is returned.

    Args:
        inX: The point to classify; one value per column of ``dataSet``.
        dataSet: 2-D array with one known sample per row.
        labels: Sequence of labels, ``labels[i]`` belonging to row ``i``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that occurs most often among the ``k`` nearest rows.

    Raises:
        IndexError: If ``k`` is 0 (nobody votes) or larger than the number
            of rows in ``dataSet``.
    """
    # calculate the Euclidean distance from inX to every row
    dataSetSize = dataSet.shape[0]
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDiffMat = diffMat ** 2
    sqDistances = sqDiffMat.sum(axis=1)
    distances = sqDistances ** 0.5
    # row indices ordered from the closest row to the farthest
    sortedDistIndicies = distances.argsort()

    # let the k nearest rows vote with their labels
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # rank the labels by vote count, most frequent first;
    # dict.iteritems() was removed in Python 3, items() works everywhere
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    print(sortedClassCount[0][0])
    return sortedClassCount[0][0]
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line is stripped and split on tabs. The first three
    fields are parsed as floats and become one row of the matrix; the last
    field is parsed as an int and becomes that row's label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: an array with one row of
        three floats per data line, and the list of int labels in the same
        order.

    Raises:
        ValueError: If a feature field is not a float or the last field is
            not an int.
    """
    # the with-block closes the file even if parsing fails later
    with open(filename) as fr:
        arrayOfLines = [line.strip() for line in fr]
    # blank lines (e.g. a trailing newline) carry no sample, so skip them
    arrayOfLines = [line for line in arrayOfLines if line]

    returnMat = zeros((len(arrayOfLines), 3))
    classLabelVector = []
    for index, line in enumerate(arrayOfLines):
        listFromLine = line.split('\t')
        # convert explicitly instead of relying on numpy's string casting
        returnMat[index, :] = [float(field) for field in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
# Matplotlib
def draw_point(datingDataMat, datingLabels):
    """Show a scatter plot of column 1 against column 2 of the data matrix.

    Both the marker size and the marker colour of each point are set to
    15 times that point's label.

    Args:
        datingDataMat: 2-D array; columns 1 and 2 supply the x and y values.
        datingLabels: Sequence of numeric labels, one per row.
    """
    figure = plt.figure()
    axes = figure.add_subplot(111)
    x_values = datingDataMat[:, 1]
    y_values = datingDataMat[:, 2]
    # positional order of scatter() is (x, y, size, colour)
    axes.scatter(x_values, y_values,
                 15 * array(datingLabels),
                 15 * array(datingLabels))
    plt.show()
# The features take values on very different scales, so each feature is
# normalised first; otherwise the largest one would dominate the distance.
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` linearly so its minimum maps to 0.

    Each value becomes ``(value - column_min) / column_range``. A column
    whose values are all equal has a range of 0; it is mapped to all zeros
    instead of producing NaN from a division by zero.

    Args:
        dataSet: 2-D array with one sample per row and one feature per
            column.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the rescaled array, the
        per-column ``max - min`` (the true ranges, zeros included), and the
        per-column minimum.
    """
    minVals = dataSet.min(0)  # 0 -> take the minimum down each column
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals

    # divide by 1 where the range is 0; the numerator is 0 there anyway.
    # Work on a copy so the ranges handed back to the caller stay untouched.
    safeRanges = ranges.copy()
    safeRanges[safeRanges == 0] = 1

    # broadcasting applies the per-column vectors to every row
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
# Entry function (the one I used while testing)
def run_knn():
    """Classify every sample of ``datingTestSet2.txt`` and report the hit rate.

    Loads the file, normalises the features, classifies each row with
    ``k = 3`` against the whole normalised data set, prints the predicted
    and the expected label for every row followed by summary counts, and
    finally shows the scatter plot.
    """
    print("This is the entry function of all")
    datingDataMat, datingLabels = file2matrix("datingTestSet2.txt")
    # autoNorm is the normaliser defined in this file; only the rescaled
    # matrix is needed here
    datingDataMat = autoNorm(datingDataMat)[0]

    true_outcome = 0
    flag = 0
    # NOTE: every row is classified against a set that contains the row
    # itself, so one of the 3 votes always comes from the sample being tested.
    for item in datingDataMat:
        true_labels = classify0(item, datingDataMat, datingLabels, 3)
        if true_labels == datingLabels[flag]:
            true_outcome += 1
        # apply the format with %; passing the value as a second print()
        # argument would print the raw "%d" text
        print("The true label is %d" % true_labels)
        print("The expect label is %d" % datingLabels[flag])
        flag += 1

    print("The true outcome is:%d" % true_outcome)
    print("The total number is:%d" % flag)
    print("The right rate is:%f" % (1.0 * true_outcome / flag))
    draw_point(datingDataMat, datingLabels)
# Test function
def datingClassTest(hoRatio=0.10):
    """Hold out the first rows of ``datingTestSet2.txt`` and print the error rate.

    The first ``int(m * hoRatio)`` normalised rows are used as test vectors;
    each one is classified with ``k = 3`` against the remaining rows. The
    prediction and the real label are printed for every test vector,
    followed by the overall error rate.

    Args:
        hoRatio: Fraction of the rows (taken from the top of the file) to
            hold out as test vectors. Defaults to 0.10.
    """
    datingDataMat, datingLabels = file2matrix("datingTestSet2.txt")
    normMat = autoNorm(datingDataMat)[0]
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :],
                                     normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m],
                                     3)
        # print(...) with one pre-formatted string runs on Python 2 and 3
        print("the classifier came back with: %d,the real answer is :%d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0

    if numTestVecs:
        print("the total error rate is :%f" % (errorCount / float(numTestVecs)))
    else:
        # too few rows for the chosen ratio: nothing was tested, and the
        # rate would be a division by zero
        print("no test vectors were held out; error rate is undefined")
没有评论:
发表评论