落花盈香: KNN实战

写在开头，以下是我在过程中遇到的一些问题
1.关于KNN算法代码里的一些问题
 1.1关于numpy的sum用法

Check out the documentation for numpy.sum, paying particular attention to the axis parameter. 
To sum over columns:
>>> import numpy as np
>>> a = np.arange(12).reshape(4,3)
>>> a.sum(axis=0)
array([18, 22, 26])
Or, to sum over rows:
>>> a.sum(axis=1)
array([ 3, 12, 21, 30])
 
 
 1.2关于numpy的tile
>>> a = np.array([0, 1, 2])
>>> b = np.tile(a, (2, 1))
>>> b
array([[0, 1, 2],
       [0, 1, 2]])
 

 1.3关于numpy的argsort
One dimensional array:
>>> x = np.array([3, 1, 2])
>>> np.argsort(x)
array([1, 2, 0]) 
 
 1.4 关于matplotlib中figure问题
   MATLAB, and pyplot, have the concept of the current figure and the current axes. 
All plotting commands apply to the current axes.
The figure() command here is optional because figure(1) will be created by default,
just as a subplot(111) will be created by default if you don’t manually specify 
any axes. The subplot() command specifies numrows, numcols, fignum where fignum 
ranges from 1 to numrows*numcols. 
 


2.KNN算法代码
def classify0(inX,dataSet,labels,k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Args:
        inX: feature vector to classify; it is tiled to one copy per row of
            ``dataSet`` before the subtraction.
        dataSet: 2-D numpy array of reference samples, one sample per row.
        labels: sequence of class labels, indexed in step with the rows of
            ``dataSet``.
        k: number of nearest neighbours that take part in the vote.

    Returns:
        The label that received the most votes among the ``k`` closest rows.

    Raises:
        IndexError: if ``k`` is smaller than 1 or larger than the number of
            rows in ``dataSet``.
    """
    # Euclidean distance from inX to every row of dataSet.
    dataSetSize = dataSet.shape[0]
    diffMat = tile(inX,(dataSetSize,1)) - dataSet
    sqDistances = (diffMat**2).sum(axis=1)
    distances = sqDistances**0.5
    # Row indices ordered from nearest to farthest.
    sortedDistIndicies = distances.argsort()
    # Count the labels of the k nearest rows.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1
    # Order the (label, votes) pairs by vote count, highest first.
    # items() works on Python 2 and 3; iteritems() exists on Python 2 only.
    sortedClassCount = sorted(classCount.items(),key=operator.itemgetter(1),reverse=True)
    return sortedClassCount[0][0]
 
3.从文件中读取数据的代码（这种处理文件的流程在很多代码处理文件时都会用到）
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line must hold at least three numeric feature columns,
    with an integer class label in the last column.

    Args:
        filename: path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: an ``(n, 3)`` float array
        built from the first three columns of each line, and a list of ``n``
        int labels taken from the last column.

    Raises:
        ValueError: if a feature or label field cannot be parsed as a number.
    """
    # The with-block closes the file even if parsing fails further down.
    with open(filename) as fr:
        arrayOfLines = [line.strip() for line in fr]
    # Blank lines (e.g. a trailing newline) carry no sample; skip them.
    arrayOfLines = [line for line in arrayOfLines if line]
    numberOfLines = len(arrayOfLines)
    # zeros() takes the shape as ONE tuple; a second positional argument
    # would be interpreted as the dtype.
    returnMat = zeros((numberOfLines,3))
    classLabelVector = []
    for index,line in enumerate(arrayOfLines):
        listFromLine = line.split('\t')
        returnMat[index,:] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat,classLabelVector
 
 4.完整的代码 
from numpy import *
import matplotlib
import matplotlib.pyplot as plt
import operator

def createDataSet():
    """Return the four-point toy data set together with its class labels."""
    # Two points near (1, 1) belong to class 'A', two near (0, 0) to class 'B'.
    samples = [[1.0,1.1],[1.0,1.0],[0,0],[0,0.1]]
    group = array(samples)
    labels = list('AABB')
    return group,labels

def classify0(inX,dataSet,labels,k):
    """Classify one sample by majority vote among its k nearest neighbours.

    The winning label is also printed before it is returned.

    Args:
        inX: feature vector to classify; it is tiled to one copy per row of
            ``dataSet`` before the subtraction.
        dataSet: 2-D numpy array of reference samples, one sample per row.
        labels: sequence of class labels, indexed in step with the rows of
            ``dataSet``.
        k: number of nearest neighbours that take part in the vote.

    Returns:
        The label that received the most votes among the ``k`` closest rows.

    Raises:
        IndexError: if ``k`` is smaller than 1 or larger than the number of
            rows in ``dataSet``.
    """
    # Euclidean distance from inX to every row of dataSet.
    dataSetSize = dataSet.shape[0]
    diffMat = tile(inX,(dataSetSize,1)) - dataSet
    sqDistances = (diffMat**2).sum(axis=1)
    distances = sqDistances**0.5
    # Row indices ordered from nearest to farthest.
    sortedDistIndicies = distances.argsort()
    # Count the labels of the k nearest rows.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1
    # Order the (label, votes) pairs by vote count, highest first.
    # items() works on Python 2 and 3; iteritems() exists on Python 2 only.
    sortedClassCount = sorted(classCount.items(),key=operator.itemgetter(1),reverse=True)
    print(sortedClassCount[0][0])
    return sortedClassCount[0][0]

def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line must hold at least three numeric feature columns,
    with an integer class label in the last column.

    Args:
        filename: path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: an ``(n, 3)`` float array
        built from the first three columns of each line, and a list of ``n``
        int labels taken from the last column.

    Raises:
        ValueError: if a feature or label field cannot be parsed as a number.
    """
    # The with-block closes the file even if parsing fails further down.
    with open(filename) as fr:
        arrayOfLines = [line.strip() for line in fr]
    # Blank lines (e.g. a trailing newline) carry no sample; skip them so
    # int('') is never attempted and the matrix has no all-zero filler rows.
    arrayOfLines = [line for line in arrayOfLines if line]
    numberOfLines = len(arrayOfLines)
    returnMat = zeros((numberOfLines,3))
    classLabelVector = []
    for index,line in enumerate(arrayOfLines):
        listFromLine = line.split('\t')
        # Convert explicitly instead of relying on numpy's string-to-float cast.
        returnMat[index,:] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat,classLabelVector

#Matplotlib
def draw_point(datingDataMat,datingLabels):
    """Scatter-plot column 1 against column 2 of ``datingDataMat``.

    Marker size and colour are both driven by ``15 * label``, so each class
    in ``datingLabels`` is drawn with its own size and colour.
    """
    scaledLabels = 15*array(datingLabels)
    axes = plt.figure().add_subplot(111)
    xValues = datingDataMat[:,1]
    yValues = datingDataMat[:,2]
    # Third and fourth positional arguments: marker size, then colour.
    axes.scatter(xValues,yValues,scaledLabels,scaledLabels)
    plt.show()

#Different features span very different value ranges,
#so we normalize the features to get a more accurate outcome
def autoNorm(dataSet):
    """Min-max scale every column of ``dataSet``.

    Args:
        dataSet: 2-D numpy array, one sample per row and one feature per
            column.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)`` where ``normDataSet`` is
        ``(dataSet - minVals) / ranges`` computed column by column, ``ranges``
        is the per-column ``max - min`` and ``minVals`` is the per-column
        minimum. A column whose values are all equal (range 0) is mapped to 0
        instead of producing NaN from a 0/0 division; the returned ``ranges``
        still reports 0 for such a column.
    """
    minVals = dataSet.min(0)  # axis 0: minimum of each column
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 rather than 0; their numerator is already 0.
    safeRanges = where(ranges == 0,1,ranges)
    # Broadcasting applies the per-column vectors to every row, so no tile()
    # copies (and no pre-allocated zeros matrix) are needed.
    normDataSet = (dataSet - minVals)/safeRanges
    return normDataSet,ranges,minVals

#Entry function (this is the function I ran while testing)
def run_knn():
    """Classify every sample of ``datingTestSet2.txt`` and print the hit rate.

    Loads the data file with ``file2matrix``, normalises it with ``autoNorm``,
    classifies each row with ``classify0`` (k=3) against the full normalised
    data set, prints the per-row and summary results, and finally plots the
    data with ``draw_point``.
    """
    print("This is the entry function of all")
    datingDataMat,datingLabels = file2matrix("datingTestSet2.txt")
    # The normaliser defined in this file is autoNorm; there is no `norm`.
    datingDataMat,ranges,minVals = autoNorm(datingDataMat)
    total = 0
    true_outcome = 0
    for item,expected in zip(datingDataMat,datingLabels):
        # NOTE: the row being classified is itself part of the reference set,
        # so it is always one of its own nearest neighbours and the reported
        # rate is optimistic.
        true_labels = classify0(item,datingDataMat,datingLabels,3)
        if true_labels == expected:
            true_outcome += 1
        # Apply the format with %; passing the value as a second print()
        # argument would leave the "%d" placeholder in the output.
        print("The true label is %d" % true_labels)
        print("The expect label is %d" % expected)
        total += 1

    print("The true outcome is:%d" % true_outcome)
    print("The total number is:%d" % total)
    if total:
        print("The right rate is:%f" % (1.0*true_outcome/total))
    draw_point(datingDataMat,datingLabels)

#Test function 
def datingClassTest():
    """Hold out the first 10% of the dating data and print the kNN error rate.

    The first ``numTestVecs`` normalised rows are classified with
    ``classify0`` (k=3) against the remaining rows; each prediction is
    printed next to the real label, followed by the overall error rate.
    """
    hoRatio = 0.10
    datingDataMat,datingLabels = file2matrix("datingTestSet2.txt")
    normMat,ranges,minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m*hoRatio)
    if numTestVecs == 0:
        # Fewer than 1/hoRatio rows: nothing to test, and the rate below
        # would divide by zero.
        print("no test vectors: the data set is too small for the hold-out ratio")
        return
    errorCount = 0.0
    for i in range(numTestVecs):
        # Rows [numTestVecs, m) are the reference set; rows before that are
        # the held-out test samples.
        classifierResult = classify0(normMat[i,:],normMat[numTestVecs:m,:],
                        datingLabels[numTestVecs:m],3)
        # print(...) with a single pre-formatted string runs on Python 2 and 3.
        print("the classifier came back with: %d,the real answer is :%d"
                %(classifierResult,datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is :%f" %(errorCount/float(numTestVecs)))
落花盈香

2017年6月13日星期二

KNN实战

没有评论:

发表评论

leetcode 17

搜索此博客