# 《Python机器学习——预测分析核心算法》——2.2　分类问题：用声纳发现未爆炸的水雷

### 2.2　分类问题：用声纳发现未爆炸的水雷

#### 2.2.1　“岩石vs.水雷”数据集的物理特性

（输出：outputRocksVMinesSummaries.txt）

__author__ = 'mike_bowles'
import urllib2
import sys

# Read the "rocks vs. mines" sonar data set from the UCI repository.
target_url = ("https://archive.ics.uci.edu/ml/machine-learning-"
              "databases/undocumented/connectionist-bench/sonar/sonar.all-data")

data = urllib2.urlopen(target_url)

# Arrange data into a list of lists: one inner list per CSV row.
xList = []
labels = []
for line in data:
    # split on comma
    row = line.strip().split(",")
    xList.append(row)

# Report the shape of the data set. Column count is taken from the first
# row (the original used xList[1], which breaks on a one-row file).
sys.stdout.write("Number of Rows of Data = " + str(len(xList)) + '\n')
sys.stdout.write("Number of Columns of Data = " + str(len(xList[0])))

Output:
Number of Rows of Data = 208
Number of Columns of Data = 61

（输出：outputRocksVMinesContents.txt）

__author__ = 'mike_bowles'
import urllib2
import sys

# read data from uci data repository
target_url = ("https://archive.ics.uci.edu/ml/machine-learning-"
              "databases/undocumented/connectionist-bench/sonar/sonar.all-data")

data = urllib2.urlopen(target_url)

# arrange data into list for labels and list of lists for attributes
xList = []
labels = []
for line in data:
    # split on comma
    row = line.strip().split(",")
    xList.append(row)

nrow = len(xList)
ncol = len(xList[1])

# For each column, tally how many entries parse as a number, how many are
# non-empty strings, and how many are empty ("other").
# (renamed from `type`, which shadowed the builtin)
colCounts = []
counts = [0] * 3  # [numeric, string, empty] tallies for the current column

for col in range(ncol):
    for row in xList:
        try:
            a = float(row[col])
            if isinstance(a, float):
                counts[0] += 1
        except ValueError:
            if len(row[col]) > 0:
                counts[1] += 1
            else:
                counts[2] += 1

    colCounts.append(counts)
    counts = [0] * 3

# Print the per-column census as a table.
# Bug fix: the original wrote 't' (a literal letter) instead of '\t',
# which garbled the header row; the book's output shows tab separation.
sys.stdout.write("Col#" + '\t' + "Number" + '\t' +
                 "Strings" + '\t ' + "Other\n")

iCol = 0
for types in colCounts:
    sys.stdout.write(str(iCol) + '\t\t' + str(types[0]) + '\t\t' +
                     str(types[1]) + '\t\t' + str(types[2]) + "\n")
    iCol += 1


Output:
Col# Number Strings Other
0 208 0 0
1 208 0 0
2 208 0 0
3 208 0 0
4 208 0 0
5 208 0 0
6 208 0 0
7 208 0 0
8 208 0 0
9 208 0 0
10 208 0 0
11 208 0 0
. . . .
. . . .
. . . .
54 208 0 0
55 208 0 0
56 208 0 0
57 208 0 0
58 208 0 0
59 208 0 0
60 0 208 0

#### 2.2.2　“岩石vs.水雷”数据集统计特征

（输出：outputSummaryStats.txt）

__author__ = 'mike_bowles'
import urllib2
import sys
import numpy as np

# read data from uci data repository
target_url = ("https://archive.ics.uci.edu/ml/machine-learning-"
              "databases/undocumented/connectionist-bench/sonar/sonar.all-data")
data = urllib2.urlopen(target_url)

# arrange data into list for labels and list of lists for attributes
xList = []
labels = []
for line in data:
    # split on comma
    row = line.strip().split(",")
    xList.append(row)

nrow = len(xList)
ncol = len(xList[1])

# generate summary statistics for column 3 (e.g.)
col = 3
colData = [float(row[col]) for row in xList]

colArray = np.array(colData)
colMean = np.mean(colArray)
colsd = np.std(colArray)
sys.stdout.write("Mean = " + '\t' + str(colMean) + '\t\t' +
                 "Standard Deviation = " + '\t ' + str(colsd) + "\n")

# calculate quantile boundaries (quartiles first)
ntiles = 4
percentBdry = []
for i in range(ntiles + 1):
    percentBdry.append(np.percentile(colArray, i * 100 / ntiles))

sys.stdout.write("\nBoundaries for 4 Equal Percentiles \n")
print(percentBdry)
sys.stdout.write(" \n")

# run again with 10 equal intervals (deciles)
ntiles = 10
percentBdry = []
for i in range(ntiles + 1):
    percentBdry.append(np.percentile(colArray, i * 100 / ntiles))

sys.stdout.write("Boundaries for 10 Equal Percentiles \n")
print(percentBdry)
sys.stdout.write(" \n")

# The last column contains the categorical labels (R = rock, M = mine).
col = 60
colData = [row[col] for row in xList]

unique = set(colData)
sys.stdout.write("Unique Label Values \n")
print(unique)

# count up the number of elements having each value; the counter is sized
# from the number of distinct labels rather than hard-coded to 2
catDict = dict(zip(list(unique), range(len(unique))))

catCount = [0] * len(unique)

for elt in colData:
    catCount[catDict[elt]] += 1
sys.stdout.write("\nCounts for Each Value of Categorical Label \n")
print(list(unique))
print(catCount)

Output:
Mean =   0.053892307       Standard Deviation =       0.046415983

Boundaries for 4 Equal Percentiles
[0.0057999999999999996, 0.024375000000000001, 0.044049999999999999,
0.064500000000000002, 0.4264]

Boundaries for 10 Equal Percentiles
[0.00579999999999, 0.0141, 0.022740000000, 0.0278699999999,
0.0362200000000, 0.0440499999999, 0.050719999999, 0.0599599999999,
0.0779400000000, 0.10836, 0.4264]
Unique Label Values
set(['R', 'M'])

Counts for Each Value of Categorical Label
['R', 'M']
[97, 111]

#### 2.2.3　用分位数图展示异常点

__author__ = 'mike_bowles'
import numpy as np
import pylab
import scipy.stats as stats
import urllib2
import sys

target_url = ("https://archive.ics.uci.edu/ml/machine-learning-"
              "databases/undocumented/connectionist-bench/sonar/sonar.all-data")

data = urllib2.urlopen(target_url)

# arrange data into list for labels and list of lists for attributes
xList = []
labels = []
for line in data:
    # split on comma
    row = line.strip().split(",")
    xList.append(row)

nrow = len(xList)
ncol = len(xList[1])

# Collect column 3 as floats for the quantile-quantile plot.
col = 3
colData = [float(row[col]) for row in xList]

# Q-Q plot against a normal distribution: points far off the straight
# line at either end indicate heavy tails / outliers in this attribute.
stats.probplot(colData, dist="norm", plot=pylab)
pylab.show()


<div style="text-align: center"><img src="https://yqfile.alicdn.com/55c85d987d43d66785f4db1ea559518b0e0a6369.png" width="" height="">
</div>

#### 2.2.4　类别属性的统计特征

#### 2.2.5　利用Python Pandas对“岩石vs.水雷”数据集进行统计分析
Python Pandas工具包可以帮助自动化数据统计分析的过程，已经被证实在数据预处理阶段特别有用。Pandas工具包可以将数据读入一种特定的数据结构，叫作数据框（data frame）。数据框是依据CRAN-R数据结构建模的。

Pandas工具包的安装可能会有困难，主要原因是它有一系列的依赖，每个依赖必须安装正确的版本，而且相互之间要匹配，或者诸如此类的问题。绕过此类障碍的一个简单的方法就是直接安装Anaconda Python Distribution分发包，此分发包可以直接从Continuum Analytics处下载。安装过程十分简单，只要按指令依次进行就可以安装好数据分析、机器学习所需的大量软件包。

__author__ = 'mike_bowles'
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plot

target_url = ("https://archive.ics.uci.edu/ml/machine-learning-"
              "databases/undocumented/connectionist-bench/sonar/sonar.all-data")

# Read the sonar data into a pandas data frame. The file has no header
# row, so pandas generates column names V0..V60 via prefix="V".
# (This line was missing in the scraped text; without it rocksVMines
# is undefined and the script raises NameError.)
rocksVMines = pd.read_csv(target_url, header=None, prefix="V")

# print head and tail of data frame
# (the book's output shows both head and tail; the head() call was lost)
print(rocksVMines.head())
print(rocksVMines.tail())

# print summary of data frame (count/mean/std/quartiles per numeric column)
summary = rocksVMines.describe()
print(summary)

Output (truncated):

   V0    V1     V2   ...    V57    V58    V59 V60

0 0.0200 0.0371 0.0428 ... 0.0084 0.0090 0.0032 R
1 0.0453 0.0523 0.0843 ... 0.0049 0.0052 0.0044 R
2 0.0262 0.0582 0.1099 ... 0.0164 0.0095 0.0078 R
3 0.0100 0.0171 0.0623 ... 0.0044 0.0040 0.0117 R
4 0.0762 0.0666 0.0481 ... 0.0048 0.0107 0.0094 R

[5 rows x 61 columns]

    V0     V1     V2   ...    V57    V58    V59 V60

203 0.0187 0.0346 0.0168 ... 0.0115 0.0193 0.0157 M
204 0.0323 0.0101 0.0298 ... 0.0032 0.0062 0.0067 M
205 0.0522 0.0437 0.0180 ... 0.0138 0.0077 0.0031 M
206 0.0303 0.0353 0.0490 ... 0.0079 0.0036 0.0048 M
207 0.0260 0.0363 0.0136 ... 0.0036 0.0061 0.0115 M

[5 rows x 61 columns]

          V0         V1   ...        V58        V59

count 208.000000 208.000000 ... 208.000000 208.000000
mean 0.029164 0.038437 ... 0.007941 0.006507
std 0.022991 0.032960 ... 0.006181 0.005031
min 0.001500 0.000600 ... 0.000100 0.000600
25% 0.013350 0.016450 ... 0.003675 0.003100
50% 0.022800 0.030800 ... 0.006400 0.005300
75% 0.035550 0.047950 ... 0.010325 0.008525
max 0.137100 0.233900 ... 0.036400 0.043900

读入数据后，程序第一部分首先打印头数据和尾数据。注意到所有的头数据都有Ｒ标签，所有的尾数据都有Ｍ标签。对于这个数据集，第一部分是Ｒ标签的（岩石），第二部分是Ｍ标签的（水雷）。在分析数据时首先要注意到此类信息。在后续章节中会看到，确定模型的优劣有时需要对数据进行取样。那么取样就需要考虑到数据的存储结构。最后的代码打印输出实数属性列的统计信息。
`

|
1天前
|

Python 与机器学习：构建高效数据处理流程

9 2
|
13天前
|

GEE机器学习——混淆矩阵Classifier.confusionMatrix()和errorMatrix()和exlain()的用法（js和python代码）
GEE机器学习——混淆矩阵Classifier.confusionMatrix()和errorMatrix()和exlain()的用法（js和python代码）
11 0
|
13天前
|

GEE机器学习——最大熵分类器案例分析（JavaScript和python代码）
GEE机器学习——最大熵分类器案例分析（JavaScript和python代码）
15 0
|
13天前
|

GEE机器学习——利用支持向量机SVM进行土地分类和精度评定
GEE机器学习——利用支持向量机SVM进行土地分类和精度评定
7 0
|
14天前
|

29 0
|
16天前
|

11 0
|
17天前
|

Python与机器学习：开启智能应用的新纪元

16 2
|
17天前
|

18 2
|

Python机器学习（sklearn）——分类模型评估与调参总结（下）
Python机器学习（sklearn）——分类模型评估与调参总结
4846 0
|

Python机器学习（sklearn）——分类模型评估与调参总结（上）
Python机器学习（sklearn）——分类模型评估与调参总结
6314 0

• 机器翻译
• 工业大脑

更多

更多

更多