• 关于 in_array() 函数的搜索结果

回答

```python
#coding=utf-8
import numpy as np
import matplotlib.pylab as plt
import random

class NeuralNetwork(object):
    def __init__(self, sizes, act, act_derivative, cost_derivative):
        #sizes表示神经网络各层的神经元个数,第一层为输入层,最后一层为输出层
        #act为神经元的激活函数
        #act_derivative为激活函数的导数
        #cost_derivative为损失函数的导数
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(nueron_num, 1) for nueron_num in sizes[1:]]
        self.weights = [np.random.randn(next_layer_nueron_num, nueron_num)
                        for nueron_num, next_layer_nueron_num in zip(sizes[:-1], sizes[1:])]
        self.act = act
        self.act_derivative = act_derivative
        self.cost_derivative = cost_derivative

    #前向反馈(正向传播)
    def feedforward(self, a):
        #逐层计算神经元的激活值,公式(4)
        for b, w in zip(self.biases, self.weights):
            a = self.act(np.dot(w, a)+b)
        return a

    #随机梯度下降算法
    def SGD(self, training_data, epochs, batch_size, learning_rate):
        #将训练样本training_data随机分为若干个长度为batch_size的batch
        #使用各个batch的数据不断调整参数,学习率为learning_rate
        #迭代epochs次
        n = len(training_data)
        for j in range(epochs):
            random.shuffle(training_data)
            batches = [training_data[k:k+batch_size] for k in range(0, n, batch_size)]
            for batch in batches:
                self.update_batch(batch, learning_rate)
            print("Epoch {0} complete".format(j))

    def update_batch(self, batch, learning_rate):
        #根据一个batch中的训练样本,调整各个参数值
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        for x, y in batch:
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        #计算梯度,并调整各个参数值
        self.weights = [w-(learning_rate/len(batch))*nw for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b-(learning_rate/len(batch))*nb for b, nb in zip(self.biases, nabla_b)]

    #反向传播
    def backprop(self, x, y):
        #保存b和w的偏导数值
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        #正向传播
        activation = x
        #保存每一层神经元的激活值
        activations = [x]
        #保存每一层神经元的z值
        zs = []
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation)+b
            zs.append(z)
            activation = self.act(z)
            activations.append(activation)
        #反向传播得到各个参数的偏导数值
        #公式(13)
        d = self.cost_derivative(activations[-1], y) * self.act_derivative(zs[-1])
        #公式(17)
        nabla_b[-1] = d
        #公式(14)
        nabla_w[-1] = np.dot(d, activations[-2].transpose())
        #反向逐层计算
        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = self.act_derivative(z)
            #公式(36),反向逐层求参数偏导
            d = np.dot(self.weights[-l+1].transpose(), d) * sp
            #公式(38)
            nabla_b[-l] = d
            #公式(37)
            nabla_w[-l] = np.dot(d, activations[-l-1].transpose())
        return (nabla_b, nabla_w)

#距离函数的偏导数
def distance_derivative(output_activations, y):
    #损失函数的偏导数
    return 2*(output_activations-y)

# sigmoid函数
def sigmoid(z):
    return 1.0/(1.0+np.exp(-z))

# sigmoid函数的导数
def sigmoid_derivative(z):
    return sigmoid(z)*(1-sigmoid(z))

if __name__ == "__main__":
    #创建一个5层的全连接神经网络,每层的神经元个数为1,8,5,3,1
    #其中第一层为输入层,最后一层为输出层
    network = NeuralNetwork([1,8,5,3,1], sigmoid, sigmoid_derivative, distance_derivative)
    #训练集样本
    x = np.array([np.linspace(-7, 7, 200)]).T
    #训练集结果,由于使用了sigmoid作为激活函数,需保证其结果落在(0,1)区间内
    y = (np.cos(x)+1)/2
    #使用随机梯度下降算法(SGD)对模型进行训练
    #迭代5000次;每次随机抽取40个样本作为一个batch;学习率设为0.1
    training_data = [(np.array([x_value]), np.array([y_value])) for x_value, y_value in zip(x, y)]
    network.SGD(training_data, 5000, 40, 0.1)
    #测试集样本
    x_test = np.array([np.linspace(-9, 9, 120)])
    #测试集结果
    y_predict = network.feedforward(x_test)
    #图示对比训练集和测试集数据
    plt.plot(x, y, 'r', x_test.T, y_predict.T, '*')
    plt.show()
```
珍宝珠 2019-12-02 03:22:25 0 浏览量 回答数 0
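上面的回答给出了完整的手写反向传播实现。要验证 backprop 计算的解析梯度是否正确,常用的做法是数值梯度检验:用中心差分近似某个权重的偏导数,再与 backprop 的结果对比。下面是一个极简的草图(假设上文的 NeuralNetwork 类已在同一文件中定义;numerical_check 只是为说明而取的假设名称;cost 需与 distance_derivative 对应,即 sum((a-y)**2)):

```python
import numpy as np

def numerical_check(network, x, y, cost, eps=1e-5):
    # 只检查第一层权重矩阵的 (0, 0) 元素:用中心差分近似 dC/dw
    i, j = 0, 0
    w_backup = network.weights[0][i, j]

    network.weights[0][i, j] = w_backup + eps
    c_plus = cost(network.feedforward(x), y)
    network.weights[0][i, j] = w_backup - eps
    c_minus = cost(network.feedforward(x), y)
    network.weights[0][i, j] = w_backup          # 恢复原值

    numeric_grad = (c_plus - c_minus) / (2 * eps)
    _, nabla_w = network.backprop(x, y)          # 解析梯度
    print(numeric_grad, nabla_w[0][i, j])        # 两者应非常接近

# 用法示意:
# net = NeuralNetwork([1, 8, 5, 3, 1], sigmoid, sigmoid_derivative, distance_derivative)
# numerical_check(net, np.array([[0.5]]), np.array([[0.3]]), lambda a, y: np.sum((a - y) ** 2))
```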

回答

对于需要调用C代码的一些小的问题,通常使用Python标准库中的 ctypes 模块就足够了。 要使用 ctypes ,你首先要确保你要访问的C代码已经被编译到和Python解释器兼容 (同样的架构、字大小、编译器等)的某个共享库中了。 为了进行本节的演示,假设你有一个共享库名字叫 libsample.so ,里面的内容就是15章介绍部分那样。 另外还假设这个 libsample.so 文件被放置到位于 sample.py 文件相同的目录中了。 要访问这个函数库,你要先构建一个包装它的Python模块,如下这样: # sample.py import ctypes import os # Try to locate the .so file in the same directory as this file _file = 'libsample.so' _path = os.path.join(*(os.path.split(__file__)[:-1] + (_file,))) _mod = ctypes.cdll.LoadLibrary(_path) # int gcd(int, int) gcd = _mod.gcd gcd.argtypes = (ctypes.c_int, ctypes.c_int) gcd.restype = ctypes.c_int # int in_mandel(double, double, int) in_mandel = _mod.in_mandel in_mandel.argtypes = (ctypes.c_double, ctypes.c_double, ctypes.c_int) in_mandel.restype = ctypes.c_int # int divide(int, int, int *) _divide = _mod.divide _divide.argtypes = (ctypes.c_int, ctypes.c_int, ctypes.POINTER(ctypes.c_int)) _divide.restype = ctypes.c_int def divide(x, y): rem = ctypes.c_int() quot = _divide(x, y, rem) return quot,rem.value # void avg(double *, int n) # Define a special type for the 'double *' argument class DoubleArrayType: def from_param(self, param): typename = type(param).__name__ if hasattr(self, 'from_' + typename): return getattr(self, 'from_' + typename)(param) elif isinstance(param, ctypes.Array): return param else: raise TypeError("Can't convert %s" % typename) # Cast from array.array objects def from_array(self, param): if param.typecode != 'd': raise TypeError('must be an array of doubles') ptr, _ = param.buffer_info() return ctypes.cast(ptr, ctypes.POINTER(ctypes.c_double)) # Cast from lists/tuples def from_list(self, param): val = ((ctypes.c_double)*len(param))(*param) return val from_tuple = from_list # Cast from a numpy array def from_ndarray(self, param): return param.ctypes.data_as(ctypes.POINTER(ctypes.c_double)) DoubleArray = DoubleArrayType() _avg = _mod.avg _avg.argtypes = (DoubleArray, ctypes.c_int) _avg.restype = ctypes.c_double def avg(values): return _avg(values, len(values)) # struct Point { } class Point(ctypes.Structure): _fields_ = [('x', ctypes.c_double), ('y', ctypes.c_double)] # double distance(Point *, Point *) distance = _mod.distance distance.argtypes = (ctypes.POINTER(Point), ctypes.POINTER(Point)) distance.restype = ctypes.c_double 如果一切正常,你就可以加载并使用里面定义的C函数了。例如: >>> import sample >>> sample.gcd(35,42) 7 >>> sample.in_mandel(0,0,500) 1 >>> sample.in_mandel(2.0,1.0,500) 0 >>> sample.divide(42,8) (5, 2) >>> sample.avg([1,2,3]) 2.0 >>> p1 = sample.Point(1,2) >>> p2 = sample.Point(4,5) >>> sample.distance(p1,p2) 4.242640687119285 >>>
哦哦喔 2020-04-17 18:10:17 0 浏览量 回答数 0
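上面的包装模块依赖已编译好的 libsample.so。如果只想先单独验证 ctypes 中 argtypes/restype 的用法,可以在系统自带的 C 标准库上做一个最小实验。下面是一个示意(假设运行在类 Unix 系统上,find_library("c") 能找到 libc):

```python
import ctypes
import ctypes.util

# 加载系统的 C 标准库(具体路径因平台而异,交给 find_library 处理)
libc = ctypes.CDLL(ctypes.util.find_library("c"))

# 声明 size_t strlen(const char *) 的参数与返回类型
libc.strlen.argtypes = (ctypes.c_char_p,)
libc.strlen.restype = ctypes.c_size_t

print(libc.strlen(b"hello"))   # 输出 5
```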

问题

将图像保存为numpy数组

我无法将图像加载到numpy数组并得到这样的错误... ValueError:无法将形状(175,217,3)的输入数组广播为形状(100,100,3) 功能代码: import cv2 import numpy as np import os...
一码平川MACHEL 2019-12-01 19:31:52 1020 浏览量 回答数 1

问题

10个你可能从未用过的PHP函数:报错

1. sys_getloadavg() sys_getloadavg()可以获得系统负载情况。该函数返回一个包含三个元素的数组,每个元素分别代表系统在过去的1、5和15分钟内的平均负载。 与其让服务器因高负载宕掉...
kun坤 2020-06-12 22:13:10 0 浏览量 回答数 1

问题

10个你可能从未用过的PHP函数:配置报错 

1. sys_getloadavg() sys_getloadavg()可以获得系统负载情况。该函数返回一个包含三个元素的数组,每个元素分别代表系统在过去的1、5和15分钟内的平均负载。 与其让服务器因高负载宕掉,...
kun坤 2020-05-31 19:10:38 0 浏览量 回答数 1

回答

为了能让接受和处理数组具有可移植性,你需要使用到 Buffer Protocol . 下面是一个手写的C扩展函数例子, 用来接受数组数据并调用本章开篇部分的 avg(double *buf, int len) 函数: /* Call double avg(double *, int) */ static PyObject *py_avg(PyObject *self, PyObject *args) { PyObject *bufobj; Py_buffer view; double result; /* Get the passed Python object */ if (!PyArg_ParseTuple(args, "O", &bufobj)) { return NULL; } /* Attempt to extract buffer information from it */ if (PyObject_GetBuffer(bufobj, &view, PyBUF_ANY_CONTIGUOUS | PyBUF_FORMAT) == -1) { return NULL; } if (view.ndim != 1) { PyErr_SetString(PyExc_TypeError, "Expected a 1-dimensional array"); PyBuffer_Release(&view); return NULL; } /* Check the type of items in the array */ if (strcmp(view.format,"d") != 0) { PyErr_SetString(PyExc_TypeError, "Expected an array of doubles"); PyBuffer_Release(&view); return NULL; } /* Pass the raw buffer and size to the C function */ result = avg(view.buf, view.shape[0]); /* Indicate we're done working with the buffer */ PyBuffer_Release(&view); return Py_BuildValue("d", result); } 下面我们演示下这个扩展函数是如何工作的: >>> import array >>> avg(array.array('d',[1,2,3])) 2.0 >>> import numpy >>> avg(numpy.array([1.0,2.0,3.0])) 2.0 >>> avg([1,2,3]) Traceback (most recent call last): File "<stdin>", line 1, in <module> TypeError: 'list' does not support the buffer interface >>> avg(b'Hello') Traceback (most recent call last): File "<stdin>", line 1, in <module> TypeError: Expected an array of doubles >>> a = numpy.array([[1.,2.,3.],[4.,5.,6.]]) >>> avg(a[:,2]) Traceback (most recent call last): File "<stdin>", line 1, in <module> ValueError: ndarray is not contiguous >>> sample.avg(a) Traceback (most recent call last): File "<stdin>", line 1, in <module> TypeError: Expected a 1-dimensional array >>> sample.avg(a[0]) 2.0 >>>
哦哦喔 2020-04-17 18:11:16 0 浏览量 回答数 0
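回答末尾的 "ndarray is not contiguous" 错误来自按列切片得到的非连续数组:PyBUF_ANY_CONTIGUOUS 要求传入的缓冲区在内存中是连续的。在 Python 侧可以先检查并转换成连续数组再传给扩展函数,下面是一个只用 NumPy 的小示意(avg 指上文的扩展函数,这里只演示连续性的处理):

```python
import numpy as np

a = np.array([[1., 2., 3.], [4., 5., 6.]])
col = a[:, 2]                       # 列切片:元素在内存中不连续
print(col.flags['C_CONTIGUOUS'])    # False

col_c = np.ascontiguousarray(col)   # 拷贝为连续数组
print(col_c.flags['C_CONTIGUOUS'])  # True
# 之后再调用 avg(col_c) 就不会触发 "ndarray is not contiguous"
```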

回答

由于数据集与上次练习中使用的数据集相同,我们将重新使用上次的代码来加载数据。 上传参考链接:https://developer.aliyun.com/ask/260171 import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.io import loadmat %matplotlib inline data = loadmat('data/ex3data1.mat') data {'X': array([[ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.], ..., [ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.]]), '__globals__': [], '__header__': 'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sun Oct 16 13:09:09 2011', '__version__': '1.0', 'y': array([[10], [10], [10], ..., [ 9], [ 9], [ 9]], dtype=uint8)} 我们以后需要和经常使用变量,先创建一些有用的变量。 X = data['X'] y = data['y'] X.shape, y.shape ((5000L, 400L), (5000L, 1L) )``` 我们还需要对标签进行专有热编码。专有热编码将类标签\(n \)(出于\(k \)类)转换为长度\(k \)的向量,其中索引\(n \)为“ hot”(1),其余为零。scikit-学习有一个内置的实用工具,我们可以使用它。 ```js from sklearn.preprocessing import OneHotEncoder encoder = OneHotEncoder(sparse=False) y_onehot = encoder.fit_transform(y) y_onehot.shape (5000L, 10L) 为这个练习创建的神经网络具有与我们实例数据(400 +偏差单元)大小匹配的输入层,25个单位的隐藏层(带有26个偏差单元)和10个单位的输出层对应我们的独热编码类标签。我们需要实现成本函数,用它来评估一组给定的神经网络参数的损失,源数学函数有助于将成本函数分解成多个。以下是计算成本所需的函数。 def sigmoid(z): return 1 / (1 + np.exp(-z)) def forward_propagate(X, theta1, theta2): m = X.shape[0] a1 = np.insert(X, 0, values=np.ones(m), axis=1) z2 = a1 * theta1.T a2 = np.insert(sigmoid(z2), 0, values=np.ones(m), axis=1) z3 = a2 * theta2.T h = sigmoid(z3) return a1, z2, a2, z3, h def cost(params, input_size, hidden_size, num_labels, X, y, learning_rate): m = X.shape[0] X = np.matrix(X) y = np.matrix(y) # reshape the parameter array into parameter matrices for each layer theta1 = np.matrix(np.reshape(params[:hidden_size * (input_size + 1)], (hidden_size, (input_size + 1)))) theta2 = np.matrix(np.reshape(params[hidden_size * (input_size + 1):], (num_labels, (hidden_size + 1)))) # run the feed-forward pass a1, z2, a2, z3, h = forward_propagate(X, theta1, theta2) # compute the cost J = 0 for i in range(m): first_term = np.multiply(-y[i,:], np.log(h[i,:])) second_term = np.multiply((1 - y[i,:]), np.log(1 - h[i,:])) J += np.sum(first_term - second_term) J = J / m return J 我们之前已经使用过sigmoid函数。正向传播函数计算给定当前参数的每个训练实例的假设(换句话说,给定神经网络当前的状态和一组输入,它能计算出神经网络每一层假设向量(由\(h \)表示)的形状,包含了每个类的预测概率,应该与y的独热编码相匹配。最后成本函数运行正向传播步,并计算实例的假设(预测)和真实标签之间的误差。 可以快速测试一下它是否按预期的工作。从中间步骤中看到的输出也有助于了解发生了什么。 # initial setup input_size = 400 hidden_size = 25 num_labels = 10 learning_rate = 1 # randomly initialize a parameter array of the size of the full network's parameters params = (np.random.random(size=hidden_size * (input_size + 1) + num_labels * (hidden_size + 1)) - 0.5) * 0.25 m = X.shape[0] X = np.matrix(X) y = np.matrix(y) # unravel the parameter array into parameter matrices for each layer theta1 = np.matrix(np.reshape(params[:hidden_size * (input_size + 1)], (hidden_size, (input_size + 1)))) theta2 = np.matrix(np.reshape(params[hidden_size * (input_size + 1):], (num_labels, (hidden_size + 1)))) theta1.shape, theta2.shape ((25L, 401L), (10L, 26L)) a1, z2, a2, z3, h = forward_propagate(X, theta1, theta2) a1.shape, z2.shape, a2.shape, z3.shape, h.shape ((5000L, 401L), (5000L, 25L), (5000L, 26L), (5000L, 10L), (5000L, 10L)) 计算假设矩阵\(h \)后的成本函数,用成本方程式计算\(y \)和\(h \)之间的总偏差。 cost(params, input_size, hidden_size, num_labels, X, y_onehot, learning_rate) 6.8228086634127862 下一步是在成本函数中增加正则化,增加了与参数大小相关的惩罚项。这个方程式可以归结为一行代码,将其添加到成本函数中。只需在返回语句之前添加以下内容。 J+= (float(learning_rate)/ (2 * m))* (np.sum(np.power(theta1[:,1:],2))+ np.sum(np.power(theta2[:,1:],2))) 
接下来是反向传播算法,反向传播算法计算参数更新以减少训练数据的误差。我们首先需要的是一个函数,用来计算我们先前创建的Sigmoid函数梯度。 def sigmoid_gradient(z): return np.multiply(sigmoid(z), (1 - sigmoid(z))) 现在我们准备用反向传播算法来计算梯度,由于反向传播算法所需的计算是成本函数要求的超集,我们将扩展成本函数来执行反向传播算法,并返回成本和梯度函数。 backprop函数中调用了现有的成本函数来使设计更加正确的原因是,backprop函数使用了成本函数计算的一些其他变量。我跳过了完整的实现,添加了渐变正则化。 def backprop(params, input_size, hidden_size, num_labels, X, y, learning_rate): ##### this section is identical to the cost function logic we already saw ##### m = X.shape[0] X = np.matrix(X) y = np.matrix(y) # reshape the parameter array into parameter matrices for each layer theta1 = np.matrix(np.reshape(params[:hidden_size * (input_size + 1)], (hidden_size, (input_size + 1)))) theta2 = np.matrix(np.reshape(params[hidden_size * (input_size + 1):], (num_labels, (hidden_size + 1)))) # run the feed-forward pass a1, z2, a2, z3, h = forward_propagate(X, theta1, theta2) # initializations J = 0 delta1 = np.zeros(theta1.shape) # (25, 401) delta2 = np.zeros(theta2.shape) # (10, 26) # compute the cost for i in range(m): first_term = np.multiply(-y[i,:], np.log(h[i,:])) second_term = np.multiply((1 - y[i,:]), np.log(1 - h[i,:])) J += np.sum(first_term - second_term) J = J / m # add the cost regularization term J += (float(learning_rate) / (2 * m)) * (np.sum(np.power(theta1[:,1:], 2)) + np.sum(np.power(theta2[:,1:], 2))) ##### end of cost function logic, below is the new part ##### # perform backpropagation for t in range(m): a1t = a1[t,:] # (1, 401) z2t = z2[t,:] # (1, 25) a2t = a2[t,:] # (1, 26) ht = h[t,:] # (1, 10) yt = y[t,:] # (1, 10) d3t = ht - yt # (1, 10) z2t = np.insert(z2t, 0, values=np.ones(1)) # (1, 26) d2t = np.multiply((theta2.T * d3t.T).T, sigmoid_gradient(z2t)) # (1, 26) delta1 = delta1 + (d2t[:,1:]).T * a1t delta2 = delta2 + d3t.T * a2t delta1 = delta1 / m delta2 = delta2 / m # add the gradient regularization term delta1[:,1:] = delta1[:,1:] + (theta1[:,1:] * learning_rate) / m delta2[:,1:] = delta2[:,1:] + (theta2[:,1:] * learning_rate) / m # unravel the gradient matrices into a single array grad = np.concatenate((np.ravel(delta1), np.ravel(delta2))) return J, grad 成本函数的第一部分通过“神经网络”(正向传播函数)运行数据和当前参数来计算误差,将输出与真实标签作比较。数据集的总误差表示为\(J \)。这部分是我们之前的过的成本函数。 成本函数的其余部分的本质是回答“下次运行网络时,如何调整参数以减少误差?”,它通过计算每层的贡献与总误差,提出“梯度”矩阵(或者改变参数和方向)进行适当调整。 backprop计算中最难的部分是获取矩阵维度。顺便说一下,不是只有你对使用A * B和np.multiply(A,B)感到疑惑。 让我们测试一下,以确保函数返回我们所期望的。 J, grad = backprop(params, input_size, hidden_size, num_labels, X, y_onehot, learning_rate) J, grad.shape (6.8281541822949299, (10285L,)) 最后训练我们的神经网络,利用它做出的预测,这和先前的多层次逻辑回归大致相同。 from scipy.optimize import minimize # minimize the objective function fmin = minimize(fun=backprop, x0=params, args=(input_size, hidden_size, num_labels, X, y_onehot, learning_rate), method='TNC', jac=True, options={'maxiter': 250}) fmin status: 3 success: False nfev: 250 fun: 0.33900736818312283 x: array([ -8.85740564e-01, 2.57420350e-04, -4.09396202e-04, ..., 1.44634791e+00, 1.68974302e+00, 7.10121593e-01]) message: 'Max. 
number of function evaluations reach' jac: array([ -5.11463703e-04, 5.14840700e-08, -8.18792403e-08, ..., -2.48297749e-04, -3.17870911e-04, -3.31404592e-04]) nit: 21 由于目标函数不太可能完全收敛,我们对迭代次数进行限制。我们的总成本已经下降到0.5以下,这是算法正常工作的一个指标。我们用它找到的参数,然后通过神经网络正向传播它们以获得一些预测。我们必须重构优化器的输出,以匹配神经网络所期望的参数矩阵形状,然后运行正向传播函数以生成输入数据的假设。 X = np.matrix(X) theta1 = np.matrix(np.reshape(fmin.x[:hidden_size * (input_size + 1)], (hidden_size, (input_size + 1)))) theta2 = np.matrix(np.reshape(fmin.x[hidden_size * (input_size + 1):], (num_labels, (hidden_size + 1)))) a1, z2, a2, z3, h = forward_propagate(X, theta1, theta2) y_pred = np.array(np.argmax(h, axis=1) + 1) y_pred array([[10], [10], [10], ..., [ 9], [ 9], [ 9]], dtype=int64) 最后计算准确度以观察我们训练过的神经网络的工作状况 correct = [1 if a == b else 0 for (a, b) in zip(y_pred, y)] accuracy = (sum(map(int, correct)) / float(len(correct))) print 'accuracy = {0}%'.format(accuracy * 100) accuracy = 99.22% 我们完成了,我们已经成功地实施了一个基本的反向传播的前馈式神经网络,并用它来分类手写数字图像。
珍宝珠 2019-12-02 03:22:37 0 浏览量 回答数 0
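上文用 sklearn 的 OneHotEncoder 做独热编码;如果不想引入 sklearn,用 NumPy 也能几行实现同样的效果。下面是一个示意(假设标签 y 的取值为 1 到 10,与上文数据一致,标签 10 对应最后一列):

```python
import numpy as np

def one_hot(y, num_labels=10):
    # y 形状为 (m, 1),取值 1..num_labels;返回形状 (m, num_labels) 的 0/1 矩阵
    y = np.asarray(y).ravel()
    encoded = np.zeros((y.shape[0], num_labels))
    encoded[np.arange(y.shape[0]), y - 1] = 1
    return encoded

print(one_hot(np.array([[10], [1], [3]]), 10))
```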

回答

in_array()在多维数组上不起作用。您可以编写一个递归函数来为您做到这一点:

```php
function in_array_r($needle, $haystack, $strict = false) {
    foreach ($haystack as $item) {
        if (($strict ? $item === $needle : $item == $needle) ||
            (is_array($item) && in_array_r($needle, $item, $strict))) {
            return true;
        }
    }
    return false;
}
```

问题来源于stack overflow
保持可爱mmm 2020-01-16 15:43:59 0 浏览量 回答数 0

问题

RAY Python框架内存不足

我用ray创建了一个简单的远程功能,该功能占用很少的内存。但是,在短时间运行后,内存稳定增加,并且出现RayOutOfMemoryError异常。 以下代码是此问题的非常简单的示例。...
祖安文状元 2020-02-21 17:36:13 3 浏览量 回答数 1

问题

tensorflow LSTM时间序列预测问题?报错

#coding=utf-8 import numpy as np import tensorflow as tf import matplotlib as mpl mpl.use('Agg') from matplo...
爱吃鱼的程序员 2020-06-08 13:26:46 0 浏览量 回答数 1

回答

#include "stdafx.h" #include "gt.h" #include "gtDlg.h" #include<iostream> #include<fstream> #include<math.h> #include"hanshu.h" //声明各个函数 #include"changshu.h" //声明全局变量 using namespace std; #ifdef _DEBUG #define new DEBUG_NEW #undef THIS_FILE static char THIS_FILE[] = __FILE__; #endif ///////////////////////////////////////////////////////////////////////////// // CAboutDlg dialog used for App About class CAboutDlg : public CDialog { public: CAboutDlg(); // Dialog Data //{{AFX_DATA(CAboutDlg) enum { IDD = IDD_ABOUTBOX }; //}}AFX_DATA // ClassWizard generated virtual function overrides //{{AFX_VIRTUAL(CAboutDlg) protected: virtual void DoDataExchange(CDataExchange* pDX); // DDX/DDV support //}}AFX_VIRTUAL // Implementation protected: //{{AFX_MSG(CAboutDlg) //}}AFX_MSG DECLARE_MESSAGE_MAP() }; CAboutDlg::CAboutDlg() : CDialog(CAboutDlg::IDD) { //{{AFX_DATA_INIT(CAboutDlg) //}}AFX_DATA_INIT } void CAboutDlg::DoDataExchange(CDataExchange* pDX) { CDialog::DoDataExchange(pDX); //{{AFX_DATA_MAP(CAboutDlg) //}}AFX_DATA_MAP } BEGIN_MESSAGE_MAP(CAboutDlg, CDialog) //{{AFX_MSG_MAP(CAboutDlg) // No message handlers //}}AFX_MSG_MAP END_MESSAGE_MAP() ///////////////////////////////////////////////////////////////////////////// // CGtDlg dialog CGtDlg::CGtDlg(CWnd* pParent /*=NULL*/) : CDialog(CGtDlg::IDD, pParent) { //{{AFX_DATA_INIT(CGtDlg) // NOTE: the ClassWizard will add member initialization here //}}AFX_DATA_INIT // Note that LoadIcon does not require a subsequent DestroyIcon in Win32 m_hIcon = AfxGetApp()->LoadIcon(IDR_MAINFRAME); } void CGtDlg::DoDataExchange(CDataExchange* pDX) { CDialog::DoDataExchange(pDX); //{{AFX_DATA_MAP(CGtDlg) DDX_Control(pDX, IDC_NTGRAPHCTRL1, m_Graph); //}}AFX_DATA_MAP } BEGIN_MESSAGE_MAP(CGtDlg, CDialog) //{{AFX_MSG_MAP(CGtDlg) ON_WM_SYSCOMMAND() ON_WM_PAINT() ON_WM_QUERYDRAGICON() ON_BN_CLICKED(IDC_BUTTON1, OnButton1) //}}AFX_MSG_MAP END_MESSAGE_MAP() ///////////////////////////////////////////////////////////////////////////// // CGtDlg message handlers BOOL CGtDlg::OnInitDialog() { CDialog::OnInitDialog(); // Add "About..." menu item to system menu. // IDM_ABOUTBOX must be in the system command range. ASSERT((IDM_ABOUTBOX & 0xFFF0) == IDM_ABOUTBOX); ASSERT(IDM_ABOUTBOX < 0xF000); CMenu* pSysMenu = GetSystemMenu(FALSE); if (pSysMenu != NULL) { CString strAboutMenu; strAboutMenu.LoadString(IDS_ABOUTBOX); if (!strAboutMenu.IsEmpty()) { pSysMenu->AppendMenu(MF_SEPARATOR); pSysMenu->AppendMenu(MF_STRING, IDM_ABOUTBOX, strAboutMenu); } } // Set the icon for this dialog. The framework does this automatically // when the application's main window is not a dialog SetIcon(m_hIcon, TRUE); // Set big icon SetIcon(m_hIcon, FALSE); // Set small icon // TODO: Add extra initialization here return TRUE; // return TRUE unless you set the focus to a control } void CGtDlg::OnSysCommand(UINT nID, LPARAM lParam) { if ((nID & 0xFFF0) == IDM_ABOUTBOX) { CAboutDlg dlgAbout; dlgAbout.DoModal(); } else { CDialog::OnSysCommand(nID, lParam); } } // If you add a minimize button to your dialog, you will need the code below // to draw the icon. For MFC applications using the document/view model, // this is automatically done for you by the framework. 
void CGtDlg::OnPaint() { if (IsIconic()) { CPaintDC dc(this); // device context for painting SendMessage(WM_ICONERASEBKGND, (WPARAM) dc.GetSafeHdc(), 0); // Center icon in client rectangle int cxIcon = GetSystemMetrics(SM_CXICON); int cyIcon = GetSystemMetrics(SM_CYICON); CRect rect; GetClientRect(&rect); int x = (rect.Width() - cxIcon + 1) / 2; int y = (rect.Height() - cyIcon + 1) / 2; // Draw the icon dc.DrawIcon(x, y, m_hIcon); } else { CDialog::OnPaint(); } } // The system calls this to obtain the cursor to display while the user drags // the minimized window. HCURSOR CGtDlg::OnQueryDragIcon() { return (HCURSOR) m_hIcon; } CNTGraph * g; int i; void display(double x, double y) { g->PlotXY(i, x, 0); g->PlotXY(i++, y*5000, 1); } void CGtDlg::OnButton1() { g = &m_Graph; i = 0; m_Graph.SetShowGrid(TRUE); m_Graph.SetElementLineColor(RGB(255,0,0)); m_Graph.SetRange (0,100,0,8000); m_Graph.AddElement(); m_Graph.SetElementLineColor(RGB(0,0,100)); m_Graph.SetRange (0,100,0,8000); m_Graph.SetElementIdentify(FALSE); /*打开文件*/ ifstream file; //打开文本文件 file.open("C:\\Program Files\\Microsoft Visual Studio\\MyProjects\\gt\\06_hit08.txt"); //06_hit08是存放9列数据的txt文件,只取前3000行。 if (!file) //判断文件是否打开 { cerr << "error: unable to open input file: " << file << endl; file.close(); file.clear(); } struct list //给txt文件的9列分别命名 { int list1; //公式中未用 double v0; double S; double list4; //未用 double gamair; double gamself; double E; double n; double list9; //未用 }; //将txt文件中的9列放在数组中 list txt[3000]; //ROW=3000 int i; for(i=0;i<ROW;i++) { file>>txt[i].list1>>txt[i].v0>>txt[i].S>>txt[i].list4>>txt[i].gamair >>txt[i].gamself>>txt[i].E>>txt[i].n>>txt[i].list9; } file.close(); //关闭文件 double array_tran[3000]; //ROW=3000 for(i=0;i<3000;i++) { double y_seta, y_aL, y_aG, y_fL, y_fG, y_CH4, y_tao, y_tran; //用来保存公式的计算结果 y_seta=seta(txt[i].S , txt[i].E , txt[i].v0); //调用线强修正函数seta x=txt[i].v0,y=y_seta画图 y_aL=aL(txt[i].n, txt[i].gamair, txt[i].gamself); //计算洛伦兹半宽度 y_aG=aG(txt[i].v0); //计算多普勒半宽度 y_fL=fL(y_aL,txt[i].v0,i); //洛伦兹展宽 x=txt[i].v0,y=y_fL画图 y_fG=fG(y_aG,txt[i].v0,i); //多普勒展宽 x=txt[i].v0,y=y_fG画图 y_CH4=CH4(y_fL,y_fG,y_seta); //CH4吸收截面 x=txt[i].v0,y=y_CH4画图 y_tao=tao(y_CH4); //光学厚度 x=txt[i].v0,y=y_tao画图 y_tran=tran(y_tao); //透过率 x=txt[i].v0,y=y_tran画图 array_tran[i]=y_tran; } avgs(txt[1].v0,array_tran, display); }
a123456678 2019-12-02 02:17:44 0 浏览量 回答数 0

问题

vc中程序调用ntgraph控件

#include "stdafx.h" #include "gt.h" #include "gtDlg.h" #include<iostream> #include<fstream> #include<mat...
a123456678 2019-12-01 19:22:49 1175 浏览量 回答数 1

回答

目前,Ray支持部分引用计数。(完整的参考计数将很快发布)。简而言之,当传递给远程函数的object_id未序列化时,引用引用的计数方式与引用Python的计数方式相同。这意味着,如果result_transformed是由Python收集的垃圾,result_transformed则应取消固定血浆存储中的,并在将对象清除为LRU时将其清除。(为清楚起见,不会清除具有某些引用计数的固定对象)。 我还假设存在一些奇怪的引用计数,例如循环引用。result_transformed运行该脚本时,我可以验证它是否被驱逐了。因此,我认为result_transformed本身不是问题。可能存在许多问题。就我而言,我发现当我将ipython用作输入(IN)时,它会创建对python对象的引用。(例如,当您看到某个对象的值时,OUT [number]可以引用您的对象)。 In [2]: import psutil ...: import gc ...: import ray ...: from time import sleep ...: import numpy as np ...: @ray.remote ...: def calc_similarity(sims, offset): ...: # Fake some work for 100 ms. ...: sleep(0.10) ...: return True ...: ...: if __name__ == "__main__": ...: # Initialize RAY to use all of the processors. ...: num_cpus = psutil.cpu_count(logical=False) ...: ray.init(num_cpus=num_cpus) ...: ...: num_docs = 1000000 ...: num_dimensions = 300 ...: chunk_size = 128 ...: sim_pct = 0.82 ...: ...: # Initialize the array ...: index = np.random.random((num_docs, num_dimensions)).astype(dtype=np.float32) ...: index_array = np.arange(num_docs).reshape(1, num_docs) ...: index_array_id = ray.put(index_array) ...: ...: calc_results = [] ...: i = 0 ...: for count, start_doc_no in enumerate(range(0, num_docs, chunk_size)): ...: i += 1 ...: size = min( chunk_size, num_docs - (start_doc_no) + 1 ) ...: # Get the query vector out of the index. ...: query_vector = index[start_doc_no:start_doc_no+size] ...: # Calculate the matrix multiplication. ...: result_transformed = np.matmul(index, query_vector.T).T ...: # Serialize the result matrix out for each client. ...: result_id = ray.put(result_transformed) ...: if i == 1: ...: # The first result_id binary number should be stored in result_id_special ...: # In this way, we can verify if this object id is evicted after filling up our ...: # plasma store by some random numpy array ...: # If this object id is not evicted, that means it is pinned, meaning if is ...: # not properly reference counted. ...: first_object_id = result_id.binary() ...: # Simulate multi-threading extracting the results of a cosine similarity calculation ...: for offset in range(chunk_size): ...: calc_results.append(calc_similarity.remote(sims=result_id, offset=offset )) ...: # , index_array=index_array_id)) ...: res = ray.get(calc_results) ...: calc_results.clear() ...: print('ref count to result_id {}'.format(len(gc.get_referrers(result_id)))) ...: print('Total number of ref counts in a ray cluster. {}'.format(ray.worker.global_worker.core_worker.get_all_reference_counts())) ...: if i == 5: ...: break ...: # It should contain the object id of the ...: print('first object id: {}'.format(first_object_id)) ...: print('fill up plasma store by big numpy arrays. This should evict the first_object_id from the plasma store.') ...: print('because if the data_transformed is garbage collected properly, it should be unpinned from plasma store') ...: print('and when plasma store is filled by numpy array, first_object_id should be evicted.') ...: for _ in range(40): ...: import numpy as np ...: ray.put(np.zeros(500 * 1024 * 1024, dtype=np.uint8)) ...: print('total ref count from a ray cluster after eviction: {}'.format(ray.worker.global_worker.core_worker.get_all_reference_counts())) ...: # this should fail as first_object_id is already evicted ...: print(ray.get(ray.ObjectID(first_object_id))) [ray] Forcing OMP_NUM_THREADS=1 to avoid performance degradation with many workers (issue #6998). You can override this by explicitly setting OMP_NUM_THREADS. 
2020-02-12 00:10:11,932 INFO resource_spec.py:212 -- Starting Ray with 4.35 GiB memory available for workers and up to 2.19 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>). 2020-02-12 00:10:12,273 INFO services.py:1080 -- View the Ray dashboard at localhost:8265 2020-02-12 00:10:18,522 WARNING worker.py:289 -- OMP_NUM_THREADS=1 is set, this may slow down ray.put() for large objects (issue #6998). ref count to result_id 1 Total number of ref counts in a ray cluster. {ObjectID(ffffffffffffffffffffffff0100008002000000): {'local': 1, 'submitted': 0}, ObjectID(ffffffffffffffffffffffff0100008001000000): {'local': 1, 'submitted': 0}} ref count to result_id 1 Total number of ref counts in a ray cluster. {ObjectID(ffffffffffffffffffffffff0100008003000000): {'local': 1, 'submitted': 0}, ObjectID(ffffffffffffffffffffffff0100008001000000): {'local': 1, 'submitted': 0}} ref count to result_id 1 Total number of ref counts in a ray cluster. {ObjectID(ffffffffffffffffffffffff0100008001000000): {'local': 1, 'submitted': 0}, ObjectID(ffffffffffffffffffffffff0100008004000000): {'local': 1, 'submitted': 0}} ref count to result_id 1 Total number of ref counts in a ray cluster. {ObjectID(ffffffffffffffffffffffff0100008001000000): {'local': 1, 'submitted': 0}, ObjectID(ffffffffffffffffffffffff0100008005000000): {'local': 1, 'submitted': 0}} ref count to result_id 1 Total number of ref counts in a ray cluster. {ObjectID(ffffffffffffffffffffffff0100008006000000): {'local': 1, 'submitted': 0}, ObjectID(ffffffffffffffffffffffff0100008001000000): {'local': 1, 'submitted': 0}} first object id: b'\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x01\x00\x00\x80\x02\x00\x00\x00' fill up plasma store by big numpy arrays. This should evict the first_object_id from the plasma store. because if the data_transformed is garbage collected properly, it should be unpinned from plasma store and when plasma store is filled by numpy array, first_object_id should be evicted. total ref count from a ray cluster after eviction: {ObjectID(ffffffffffffffffffffffff0100008006000000): {'local': 1, 'submitted': 0}, ObjectID(ffffffffffffffffffffffff0100008001000000): {'local': 1, 'submitted': 0}} 2020-02-12 00:10:57,108 WARNING worker.py:1515 -- Local object store memory usage: num clients with quota: 0 quota map size: 0 pinned quota map size: 0 allocated bytes: 2092865189 allocation limit: 2347285708 pinned bytes: 520000477 (global lru) capacity: 2347285708 (global lru) used: 67.0078% (global lru) num objects: 4 (global lru) num evictions: 41 (global lru) bytes evicted: 21446665725 2020-02-12 00:10:57,112 WARNING worker.py:1072 -- The task with ID ffffffffffffffffffffffff0100 is a driver task and so the object created by ray.put could not be reconstructed. --------------------------------------------------------------------------- UnreconstructableError Traceback (most recent call last) <ipython-input-1-184e5836123c> in <module> 63 print('total ref count from a ray cluster after eviction: {}'.format(ray.worker.global_worker.core_worker.get_all_reference_counts())) 64 # this should fail as first_object_id is already evicted ---> 65 print(ray.get(ray.ObjectID(first_object_id))) 66 ~/work/ray/python/ray/worker.py in get(object_ids, timeout) 1517 raise value.as_instanceof_cause() 1518 else: -> 1519 raise value 1520 1521 # Run post processors. UnreconstructableError: Object ffffffffffffffffffffffff0100008002000000 is lost (either LRU evicted or deleted by user) and cannot be reconstructed. 
Try increasing the object store memory available with ray.init(object_store_memory=<bytes>) or setting object store limits with ray.remote(object_store_memory=<bytes>). See also: https://ray.readthedocs.io/en/latest/memory-management.html
祖安文状元 2020-02-22 10:27:48 0 浏览量 回答数 0
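上面讨论的关键在于:只要驱动进程还持有某个 ObjectID 的引用,对应对象就会被固定(pinned)在 plasma store 中,无法被 LRU 驱逐。一个常见的缓解方式是在循环里及时释放不再需要的引用。下面是一个简化的示意(object_store_memory 的取值仅作演示):

```python
import numpy as np
import ray

ray.init(object_store_memory=200 * 1024 * 1024)

for i in range(100):
    big = np.random.random((1000, 1000))
    ref = ray.put(big)    # 放入共享内存,返回 ObjectID
    # ... 将 ref 传给远程任务并 ray.get 结果 ...
    del ref               # 释放驱动端引用,对象在内存紧张时才可能被驱逐
```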

回答

ThinkPHP 自动验证定义的附加规则如下:regex:使用正则进行验证(默认)unique:验证唯一性confirm:验证表单中的两个字段是否相同equal:验证是否等于某个值in:验证是否在某个范围内function:使用函数验证callback:使用方法验证自动验证例子各种自动验证参考例子如下:// 默认情况下用正则进行验证 array('title','require','标题不能为空。'), array('order','number','排序必须是数字。',2), array('email','email','邮箱格式不符合要求。'), array('qq','qq','QQ号码不正确。'), // 在新增的时候验证标题title字段是否唯一 array('title','','标题已经存在!',0,'unique',1), // 验证确认密码是否和密码一致 array('repassword','password','确认密码不正确。',0,'confirm'), // 验证class填写的值为 一班 array('class','一班','班级必须填写一班。',0,'equal'), // 当值不为空的时候判断是否在一个范围内 array('value',array(1,2,3),'值的范围不正确。',2,'in'), // 自定义函数验证用户名格式 array('username','checkName','用户名格式不正确。',0,'function'), // 在注册或更改资料是调用 checkEmail 方法检查邮箱 array('email','checkEmail',1,'callback'), 使用正则表达式(regex)验证上述几类附加规则中,使用正则表达式是经常使用的,也是系统默认的验证附加规则。系统内置了如下正则检测规则:require(必须)、email(邮箱格式)、url(URL地址)、currency(货币)、number(数字)、qq(QQ号码)、english(英文字符)。这些附加规则可以直接使用,如果这些附加规则无法满足要求,可以使用自定义的正则规则:array('username','/^{3,15}$/','用户名不符合要求。'),该规则要求用户名只能为英文字符及下划线和数字组成,且长度为3-15个字节。要了解更多的正则表达式规则参见《PHP 常用正则表达式整理》。使用自定义函数(function)验证使用自定义函数验证附加规则,函数可以是 Common/common.php 里的自定义函数,也可以是 PHP 的内置函数: class UserModel extends Model{ protected $_validate = array( array('username','checkName','用户名不符合要求。',0,'function'), }; } 自定义 checkName 函数: function checkName($username){ if(!preg_match('/^{3,15}$/', $username)){ return false; }else{ return true; } } 提示:对于用户名的规则可以直接使用正则验证而无需函数,在此只是为了演示自定义函数的验证的用法而已。使用方法(callback)验证ThinkPHP 自动验证还支持调用当前 Model 类的一个方法来进行验证。 class UserModel extends Model{ protected $_validate = array( array('email','checkEmail','邮箱已经存在。',1,'callback'), }; // checkEmail方法 2 protected function checkEmail(){ $User=new Model('User'); // 新用户注册,验证唯一 if(empty($_POST<'uid'>)){ if($user->getByEmail($_POST<'email'>)){ return false; }else{ return true; } }else{ // 更改资料判断邮箱与其他人的邮箱是否相同 if($user->where("uid!={$_POST<'uid'>} and email='{$_POST<'email'>}'")->find()){ return false; }else{ return true; } } } } 当 checkEmail 方法返回 false 时,验证就不通过。可见 ThinkPHP 自动验证功能十分强大,能满足对表单的各种验证要求。
小旋风柴进 2019-12-02 02:02:35 0 浏览量 回答数 0

问题

二维数组不可下标访问(not subscriptable)

Pycharm给出错误: print(' '.join(board[row]).replace('*', ' ')) TypeError: 'int' object is...
kun坤 2019-12-27 16:57:12 6 浏览量 回答数 1

问题

显示导致测试失败的数组条目

作为测试套件的一部分,我必须检查函数返回的numpy数组是否正确。使用np.array_equal可以很容易地进行此检查,它返回一个布尔值,判断所有数组元素是否相同。 如果测试失败,...
is大龙 2020-03-23 16:46:07 0 浏览量 回答数 1

问题

AttributeError: 'NoneType'对象没有属性'_jvm' - PySpark UDF

我有杂志订阅的数据及其创建时间,以及包含与给定用户关联的所有订阅到期日期数组的列: user_id created_date expiration_dates_for_user 202394 ...
社区小助手 2019-12-01 19:29:09 1407 浏览量 回答数 1

回答

异常检测 我们的第一个任务是利用高斯模型判断数据集里未标记的例子是否应该被认为是异常的。我们可以在简单的二维数据集上开始,这样就可以很容易的看到算法如何工作。 加载数据并绘图 import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sb from scipy.io import loadmat %matplotlib inline data = loadmat('data/ex8data1.mat') X = data['X'] X.shape (307L,2L) fig, ax= plt.subplots(figsize=(12,8)) ax.scatter(X[:,0], X[:,1]) 在中心有一个非常紧密的集群,有几个值离集群比较远。在这个简单的例子中,这几个值可以被视为异常。为了找到原因,我们需要估算数据中每个特征的高斯分布。我们需要用均值和方差定义概率分布。为此,我们将创建一个简单的函数,计算数据集中每个特征的均值和方差。 def estimate_gaussian(X): mu = X.mean(axis=0) sigma = X.var(axis=0) return mu, sigma mu, sigma = estimate_gaussian(X) mu, sigma (array([14.11222578, 14.99771051]), array([1.83263141, 1.70974533])) 现在我们已经有了模型参数,接下来需要确定概率阈值,它用来表明例子应该是否被视为异常值。我们需要使用一系列标签验证数据(真正的异常数据已经被标出),并且在给出不同阈值的情况下测试模型的识别性能。 Xval = data['Xval'] yval = data['yval'] Xval.shape, yval.shape ((307L,2L), (307L,1L)) 我们还需要一种方法来计算数据点属于给定参数集正态分布的概率。幸运的是SciPy内置了这种方法。 from scipy import stats dist = stats.norm(mu[0], sigma[0]) dist.pdf(X[:,0])[0:50] array([ 0.183842 , 0.20221694, 0.21746136, 0.19778763, 0.20858956, 0.21652359, 0.16991291, 0.15123542, 0.1163989 , 0.1594734 , 0.21716057, 0.21760472, 0.20141857, 0.20157497, 0.21711385, 0.21758775, 0.21695576, 0.2138258 , 0.21057069, 0.1173018 , 0.20765108, 0.21717452, 0.19510663, 0.21702152, 0.17429399, 0.15413455, 0.21000109, 0.20223586, 0.21031898, 0.21313426, 0.16158946, 0.2170794 , 0.17825767, 0.17414633, 0.1264951 , 0.19723662, 0.14538809, 0.21766361, 0.21191386, 0.21729442, 0.21238912, 0.18799417, 0.21259798, 0.21752767, 0.20616968, 0.21520366, 0.1280081 , 0.21768113, 0.21539967, 0.16913173]) 在不清楚的情况下,我们只需要计算数据集第一维度的前50个实例的分布概率,这些概率是通过计算该维度的均值和方差得到的。 在我们计算的高斯模型参数中,计算并保存数据集中的每一个值的概率密度。 p = np.zeros((X.shape[0], X.shape[1])) p[:,0] = stats.norm(mu[0], sigma[0]).pdf(X[:,0]) p[:,1] = stats.norm(mu[1], sigma[1]).pdf(X[:,1]) p.shape (307L,2L) 我们还需要为验证集(使用相同的模型参数)执行此操作。我们将使用这些概率和真实标签来确定最优概率阈值,进而指定数据点作为异常值。 pval = np.zeros((Xval.shape[0], Xval.shape[1])) pval[:,0] = stats.norm(mu[0], sigma[0]).pdf(Xval[:,0]) pval[:,1] = stats.norm(mu[1], sigma[1]).pdf(Xval[:,1]) 接下来我们需要一个函数,在给出的概率密度和真实标签中找到最佳阈值。为了执行这步操作,我们需要计算不同值的epsilon的F1分数,F1是true positive, false positive和 false negative数值的函数。 def select_threshold(pval, yval): best_epsilon = 0 best_f1 = 0 f1 = 0 step = (pval.max() - pval.min()) / 1000 for epsilon in np.arange(pval.min(), pval.max(), step): preds = pval < epsilon tp = np.sum(np.logical_and(preds == 1, yval == 1)).astype(float) fp = np.sum(np.logical_and(preds == 1, yval == 0)).astype(float) fn = np.sum(np.logical_and(preds == 0, yval == 1)).astype(float) precision = tp / (tp + fp) recall = tp / (tp + fn) f1 = (2 * precision * recall) / (precision + recall) if f1 > best_f1: best_f1 = f1 best_epsilon = epsilon return best_epsilon, best_f1 epsilon, f1 = select_threshold(pval, yval) epsilon, f1 (0.0095667060059568421,0.7142857142857143) 最后我们在数据集上应用阈值,可视化结果。 # indexes of the values considered to be outliers outliers = np.where(p < epsilon) fig, ax = plt.subplots(figsize=(12,8)) ax.scatter(X[:,0], X[:,1]) ax.scatter(X[outliers[0],0], X[outliers[0],1], s=50, color='r', marker='o') 结果还不错,红色的点是被标记为离群值的点,视觉上这些看起来很合理。有一些分离(但没有被标记)的右上角的点也可能是一个离群值,但这是相当接近的。 协同过滤 推荐系统使用项目和基于用户的相似性计算,以检查用户的历史偏好,从而为用户推荐可能感兴趣的新“东西”。在这个练习中,我们将实现一种特殊的推荐算法,称为协同过滤,并将其应用于电影评分的数据集。首先加载并检查我们要处理的数据。 data= loadmat('data/ex8_movies.mat') data {'R': array([[1, 1, 0, ..., 1, 0, 0], [1, 0, 0, ..., 0, 0, 1], [1, 0, 0, ..., 0, 0, 0], ..., [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0]], dtype=uint8), 'Y': array([[5, 4, 0, ..., 5, 0, 
0], [3, 0, 0, ..., 0, 0, 5], [4, 0, 0, ..., 0, 0, 0], ..., [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0]], dtype=uint8), '__globals__': [], '__header__': 'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Thu Dec 1 17:19:26 2011', '__version__': '1.0'} Y是一个包含从一到五的评分的数据组,R是一个包含二进制值的“指示器”数据组,它表明了用户是否对电影进行了评分。两者应该具有相同的形状。 Y = data['Y'] R = data['R'] Y.shape, R.shape ((1682L,943L), (1682L,943L)) 我们可以通过Y中的一行平均值来评估电影的平均评分。 Y[1,R[1,:]].mean() 2.5832449628844114 如果它是图片的话,我们也通过渲染矩阵来可视化数据。 fig, ax = plt.subplots(figsize=(12,12)) ax.imshow(Y) ax.set_xlabel('Users') ax.set_ylabel('Movies') fig.tight_layout() 接下来我们将实现协同过滤的成本函数。“成本”是指电影评分预测偏离真实预测的程度。在文本练习中,成本函数是给定的。它以文本中的X和Theta参数矩阵为基础,这些展开到参数输入中,以便使用SciPy的优化包。注意,我已经在注释中包含了数组/矩阵形状,以帮助说明矩阵交互如何工作。 def cost(params, Y, R, num_features): Y = np.matrix(Y) # (1682, 943) R = np.matrix(R) # (1682, 943) num_movies = Y.shape[0] num_users = Y.shape[1] # reshape the parameter array into parameter matrices X = np.matrix(np.reshape(params[:num_movies * num_features], (num_movies, num_features))) # (1682, 10) Theta = np.matrix(np.reshape(params[num_movies * num_features:], (num_users, num_features))) # (943, 10) # initializations J = 0 # compute the cost error = np.multiply((X * Theta.T) - Y, R) # (1682, 943) squared_error = np.power(error, 2) # (1682, 943) J = (1. / 2) * np.sum(squared_error) return J 我们提供一系列预先训练过并且我们可以评估的参数进行测试。为了减少评估时间,我们研究较小的子集。 users = 4 movies = 5 features = 3 params_data = loadmat('data/ex8_movieParams.mat') X = params_data['X'] Theta = params_data['Theta'] X_sub = X[:movies, :features] Theta_sub = Theta[:users, :features] Y_sub = Y[:movies, :users] R_sub = R[:movies, :users] params = np.concatenate((np.ravel(X_sub), np.ravel(Theta_sub))) cost(params, Y_sub, R_sub, features) 22.224603725685675 答案与练习文本中的结果匹配。接下来需要实现梯度计算,与练习四中神经网络的实现一样,我们会扩展成本函数计算梯度。 def cost(params, Y, R, num_features): Y = np.matrix(Y) # (1682, 943) R = np.matrix(R) # (1682, 943) num_movies = Y.shape[0] num_users = Y.shape[1] # reshape the parameter array into parameter matrices X = np.matrix(np.reshape(params[:num_movies * num_features], (num_movies, num_features))) # (1682, 10) Theta = np.matrix(np.reshape(params[num_movies * num_features:], (num_users, num_features))) # (943, 10) # initializations J = 0 X_grad = np.zeros(X.shape) # (1682, 10) Theta_grad = np.zeros(Theta.shape) # (943, 10) # compute the cost error = np.multiply((X * Theta.T) - Y, R) # (1682, 943) squared_error = np.power(error, 2) # (1682, 943) J = (1. / 2) * np.sum(squared_error) # calculate the gradients X_grad = error * Theta Theta_grad = error.T * X # unravel the gradient matrices into a single array grad = np.concatenate((np.ravel(X_grad), np.ravel(Theta_grad))) return J, grad J, grad = cost(params, Y_sub, R_sub, features) J, grad (22.224603725685675, array([ -2.52899165, 7.57570308, -1.89979026, -0.56819597, 3.35265031, -0.52339845, -0.83240713, 4.91163297, -0.76677878, -0.38358278, 2.26333698, -0.35334048, -0.80378006, 4.74271842, -0.74040871, -10.5680202 , 4.62776019, -7.16004443, -3.05099006, 1.16441367, -3.47410789, 0. , 0. , 0. , 0. , 0. , 0. 
])) 下一步是在成本和梯度计算中添加正则化。最终会创建一个正则化版本的函数。(注意,这个版本包含一个额外的学习速率参数“lambda”) def cost(params, Y, R, num_features, learning_rate): Y = np.matrix(Y) # (1682, 943) R = np.matrix(R) # (1682, 943) num_movies = Y.shape[0] num_users = Y.shape[1] # reshape the parameter array into parameter matrices X = np.matrix(np.reshape(params[:num_movies * num_features], (num_movies, num_features))) # (1682, 10) Theta = np.matrix(np.reshape(params[num_movies * num_features:], (num_users, num_features))) # (943, 10) # initializations J = 0 X_grad = np.zeros(X.shape) # (1682, 10) Theta_grad = np.zeros(Theta.shape) # (943, 10) # compute the cost error = np.multiply((X * Theta.T) - Y, R) # (1682, 943) squared_error = np.power(error, 2) # (1682, 943) J = (1. / 2) * np.sum(squared_error) # add the cost regularization J = J + ((learning_rate / 2) * np.sum(np.power(Theta, 2))) J = J + ((learning_rate / 2) * np.sum(np.power(X, 2))) # calculate the gradients with regularization X_grad = (error * Theta) + (learning_rate * X) Theta_grad = (error.T * X) + (learning_rate * Theta) # unravel the gradient matrices into a single array grad = np.concatenate((np.ravel(X_grad), np.ravel(Theta_grad))) return J, grad J, grad = cost(params, Y_sub, R_sub, features, 1.5) J, grad (31.344056244274221, array([ -0.95596339, 6.97535514, -0.10861109, 0.60308088, 2.77421145, 0.25839822, 0.12985616, 4.0898522 , -0.89247334, 0.29684395, 1.06300933, 0.66738144, 0.60252677, 4.90185327, -0.19747928, -10.13985478, 2.10136256, -6.76563628, -2.29347024, 0.48244098, -2.99791422, -0.64787484, -0.71820673, 1.27006666, 1.09289758, -0.40784086, 0.49026541])) 结果与执行代码的预期输出匹配,看起来正则化是有效的。在训练模型之前,还有最后一步:创建我们自己的电影评分模型,这样我们就可以使用这个模型来生成个性化的建议。我们得到了一个将电影索引链接到其标题的文件。把文件加载到字典中,并使用练习文本提供的一些样本评分。 movie_idx = {} f = open('data/movie_ids.txt') for line in f: tokens = line.split(' ') tokens[-1] = tokens[-1][:-1] movie_idx[int(tokens[0]) - 1] = ' '.join(tokens[1:]) ratings = np.zeros((1682, 1)) ratings[0] = 4 ratings[6] = 3 ratings[11] = 5 ratings[53] = 4 ratings[63] = 5 ratings[65] = 3 ratings[68] = 5 ratings[97] = 2 ratings[182] = 4 ratings[225] = 5 ratings[354] = 5 print('Rated {0} with {1} stars.'.format(movie_idx[0], str(int(ratings[0])))) print('Rated {0} with {1} stars.'.format(movie_idx[6], str(int(ratings[6])))) print('Rated {0} with {1} stars.'.format(movie_idx[11], str(int(ratings[11])))) print('Rated {0} with {1} stars.'.format(movie_idx[53], str(int(ratings[53])))) print('Rated {0} with {1} stars.'.format(movie_idx[63], str(int(ratings[63])))) print('Rated {0} with {1} stars.'.format(movie_idx[65], str(int(ratings[65])))) print('Rated {0} with {1} stars.'.format(movie_idx[68], str(int(ratings[68])))) print('Rated {0} with {1} stars.'.format(movie_idx[97], str(int(ratings[97])))) print('Rated {0} with {1} stars.'.format(movie_idx[182], str(int(ratings[182])))) print('Rated {0} with {1} stars.'.format(movie_idx[225], str(int(ratings[225])))) print('Rated {0} with {1} stars.'.format(movie_idx[354], str(int(ratings[354])))) Rated Toy Story (1995) with 4 stars. Rated Twelve Monkeys (1995) with 3 stars. Rated Usual Suspects, The (1995) with 5 stars. Rated Outbreak (1995) with 4 stars. Rated Shawshank Redemption, The (1994) with 5 stars. Rated While You Were Sleeping (1995) with 3 stars. Rated Forrest Gump (1994) with 5 stars. Rated Silence of the Lambs, The (1991) with 2 stars. Rated Alien (1979) with 4 stars. Rated Die Hard 2 (1990) with 5 stars. Rated Sphere (1998) with 5 stars. 
我们可以在数据集中添加自定义评分向量。 R = data['R'] Y = data['Y'] Y = np.append(Y, ratings, axis=1) R = np.append(R, ratings != 0, axis=1) 开始训练协同过滤模型,我们将通过成本函数、参数向量和输入的数据矩阵使评分正规化,并且运行优化程序。 from scipy.optimize import minimize movies = Y.shape[0] users = Y.shape[1] features = 10 learning_rate = 10. X = np.random.random(size=(movies, features)) Theta = np.random.random(size=(users, features)) params = np.concatenate((np.ravel(X), np.ravel(Theta))) Ymean = np.zeros((movies, 1)) Ynorm = np.zeros((movies, users)) for i in range(movies): idx = np.where(R[i,:] == 1)[0] Ymean[i] = Y[i,idx].mean() Ynorm[i,idx] = Y[i,idx] - Ymean[i] fmin = minimize(fun=cost, x0=params, args=(Ynorm, R, features, learning_rate), method='CG', jac=True, options={'maxiter': 100}) fmin status: 1 success: False njev: 149 nfev: 149 fun: 38953.88249706676 x: array([-0.07177334, -0.08315075, 0.1081135 , ..., 0.1817828 , 0.16873062, 0.03383596]) message: 'Maximum number of iterations has been exceeded.' jac: array([ 0.01833555, 0.07377974, 0.03999323, ..., -0.00970181, 0.00758961, -0.01181811]) 由于所有的优化程序都是“unrolled”,因此要正确地工作,需要将我们的矩阵重新调整回原来的维度。 X = np.matrix(np.reshape(fmin.x[:movies * features], (movies, features))) Theta = np.matrix(np.reshape(fmin.x[movies * features:], (users, features))) X.shape, Theta.shape ((1682L,10L), (944L,10L)) 我们训练过的参数有X和Theta,使用这些为我们以前添加的用户提供建议。 predictions = X * Theta.T my_preds = predictions[:, -1] + Ymean sorted_preds = np.sort(my_preds, axis=0)[::-1] sorted_preds[:10] matrix([[ 5.00000264], [ 5.00000249], [ 4.99999831], [ 4.99999671], [ 4.99999659], [ 4.99999253], [ 4.99999238], [ 4.9999915 ], [ 4.99999019], [ 4.99998643]] 这给了我们一个有序的最高评分名单,但我们失去了评分的索引。我们需要使用argsort来了解预测评分对应的电影。 idx = np.argsort(my_preds, axis=0)[::-1] print("Top 10 movie predictions:") for i in range(10): j = int(idx[i]) print('Predicted rating of {0} for movie {1}.'.format(str(float(my_preds[j])), movie_idx[j])) Top 10 movie predictions: Predicted rating of 5.00000264002 for movie Prefontaine (1997). Predicted rating of 5.00000249142 for movie Santa with Muscles (1996). Predicted rating of 4.99999831018 for movie Marlene Dietrich: Shadow and Light (1996) . Predicted rating of 4.9999967124 for movie Saint of Fort Washington, The (1993). Predicted rating of 4.99999658864 for movie They Made Me a Criminal (1939). Predicted rating of 4.999992533 for movie Someone Else's America (1995). Predicted rating of 4.99999238336 for movie Great Day in Harlem, A (1994). Predicted rating of 4.99999149604 for movie Star Kid (1997). Predicted rating of 4.99999018592 for movie Aiqing wansui (1994). Predicted rating of 4.99998642746 for movie Entertaining Angels: The Dorothy Day Story (1996). 实际上推荐的电影并没有很好地符合练习文本中的内容。原因不太清楚,我还没有找到任何可以解释的理由,在代码中可能有错误。不过,即使有一些细微的差别,这个例子的大部分也是准确的。
珍宝珠 2019-12-02 03:22:42 0 浏览量 回答数 0
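上文对每个特征分别用 stats.norm 计算概率密度。如果假设各特征相互独立(对角协方差),也可以用 SciPy 的 multivariate_normal 一次得到联合密度,再与阈值 epsilon 比较。下面是一个示意(假设 X、mu、sigma 与上文相同):

```python
import numpy as np
from scipy import stats

def joint_density(X, mu, sigma):
    # mu 为各特征均值,sigma 为各特征方差;np.diag(sigma) 即独立特征假设下的协方差矩阵
    return stats.multivariate_normal(mean=mu, cov=np.diag(sigma)).pdf(X)

# p = joint_density(X, mu, sigma)   # 形状 (m,),可直接与 epsilon 比较
```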

回答

首先加载数据集。与前面的示例不同,我们的数据文件是MATLAB的本体格式,不能被pandas自动识别,所以把它加载在Python中需要使用SciPy utility。 import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.io import loadmat %matplotlib inline data = loadmat('data/ex3data1.mat') data {'X': array([[ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.], ..., [ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.]]), '__globals__': [], '__header__': 'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sun Oct 16 13:09:09 2011', '__version__': '1.0', 'y': array([[10], [10], [10], ..., [ 9], [ 9], [ 9]], dtype=uint8)} 快速检查加载到储存器中的矩阵的形状 data['X'].shape, data['y'].shape > ((5000L, 400L), (5000L, 1L)) 我们已经加载了我们的数据。图像在martix X 被表现为400维的向量。这400个“特征”是原始20×20图像中每个像素的灰度强度。类标签在向量y中表示图像中数字的数字类。下面的图片给出了一些数字的例子。每个带有白色手写数字的灰色框代表我们数据集中400维的行。 我们的第一个任务是修改逻辑回归的实现以完全向量化(即没有“for”循环),这是因为矢量化代码除了简洁扼要,还能够利用线性代数优化,并且比迭代代码快得多。我们在练习二中的成本函数实现已经向量化。所以我们在这里重复使用相同的实现。请注意,我们正在跳到最终的正则化版本。 def sigmoid(z): return 1 / (1 + np.exp(-z)) def cost(theta, X, y, learningRate): theta = np.matrix(theta) X = np.matrix(X) y = np.matrix(y) first = np.multiply(-y, np.log(sigmoid(X * theta.T))) second = np.multiply((1 - y), np.log(1 - sigmoid(X * theta.T))) reg = (learningRate / 2 * len(X)) * np.sum(np.power(theta[:,1:theta.shape[1]], 2)) return np.sum(first - second) / (len(X)) + reg 这个成本函数与我们在先前练习中创建的函数是相同的,如果你不确定我们如何做到这一点,在运行之前查看以前的文章。 接下来,我们需要计算梯度的函数。我们已经在前面的练习中定义了它,我们在更新步骤中需要去掉“for”循环。这是可供参考的原始代码: def gradient_with_loop(theta, X, y, learningRate): theta = np.matrix(theta) X = np.matrix(X) y = np.matrix(y) parameters = int(theta.ravel().shape[1]) grad = np.zeros(parameters) error = sigmoid(X * theta.T) - y for i in range(parameters): term = np.multiply(error, X[:,i]) if (i == 0): grad[i] = np.sum(term) / len(X) else: grad[i] = (np.sum(term) / len(X)) + ((learningRate / len(X)) * theta[:,i]) return grad 梯度函数详细的阐述了如何改变一个参数,以获得一个比之前好的答案。如果你不熟悉线性代数,这一系列运作背后的数学是很难理解的。 现在我们需要创建一个不使用任何循环的梯度函数的版本。在我们的新版本中,我们将去掉“for”循环,并使用线性代数(除了截距参数,它需要单独计算)计算每个参数的梯度。 还要注意,我们将数据结构转换为NumPy矩阵。这样做是为了使代码看起来更像Octave,而不是数组,这是因为矩阵自动遵循矩阵运算规则与element-wise运算。我们在下面的例子中使用矩阵。 def gradient(theta, X, y, learningRate): theta = np.matrix(theta) X = np.matrix(X) y = np.matrix(y) parameters = int(theta.ravel().shape[1]) error = sigmoid(X * theta.T) - y grad = ((X.T * error) / len(X)).T + ((learningRate / len(X)) * theta) # intercept gradient is not regularized grad[0, 0] = np.sum(np.multiply(error, X[:,0])) / len(X) return np.array(grad).ravel() 现在我们已经定义了成本和梯度函数,接下来创建一个分类器。对于本章练习的任务,我们有10个可能的分类,由于逻辑回归一次只能区分两个类别,我们需要一个方法去处理多类别场景。在这个练习中我们的任务是实现一对多的分类,其中k个不同类的标签导致了k个分类器,每个分类器在“class i”和“not class i”之间做决定。我们将在一个函数中完成分类器的训练,计算10个分类器的最终权重,并将权重返回作为k X(n + 1)数组,其中n是参数数。 from scipy.optimize import minimize def one_vs_all(X, y, num_labels, learning_rate): rows = X.shape[0] params = X.shape[1] # k X (n + 1) array for the parameters of each of the k classifiers all_theta = np.zeros((num_labels, params + 1)) # insert a column of ones at the beginning for the intercept term X = np.insert(X, 0, values=np.ones(rows), axis=1) # labels are 1-indexed instead of 0-indexed for i in range(1, num_labels + 1): theta = np.zeros(params + 1) y_i = np.array([1 if label == i else 0 for label in y]) y_i = np.reshape(y_i, (rows, 1)) # minimize the objective function fmin = minimize(fun=cost, x0=theta, args=(X, y_i, learning_rate), method='TNC', jac=gradient) all_theta[i-1,:] = fmin.x return all_theta 
这里需要注意的几点:首先,我们为theta添加了一个额外的参数(带有一列训练数据)以计算截距项。其次,我们将y从类标签转换为每个分类器的二进制值(要么是I类,要么不是I类)。最后,我们使用SciPy的较新优化API来最小化每个分类器的成本函数。API利用目标函数、初始参数集、优化方法和jacobian(梯度)函数,将优化程序找到的参数分配给参数数组。 实现向量化代码的一个更具挑战性的部分是正确地写入所有的矩阵交互,所以通过查看正在使用的数组/矩阵的形状来做一些健全性检查是有用的,我们来看看上面的函数中使用的一些数据结构。 rows = data['X'].shape[0] params = data['X'].shape[1] all_theta = np.zeros((10, params + 1)) X = np.insert(data['X'], 0, values=np.ones(rows), axis=1) theta = np.zeros(params + 1) y_0 = np.array([1 if label == 0 else 0 for label in data['y']]) y_0 = np.reshape(y_0, (rows, 1)) X.shape, y_0.shape, theta.shape, all_theta.shape > ((5000L, 401L), (5000L, 1L), (401L,), (10L, 401L)) 注意,theta是一维数组,所以当它被转换为计算梯度的代码中的矩阵时,它变成一个(1×401)矩阵。 我们还要检查y中的类标签,以确保它们看起来像我们期望的。 np.unique(data['y']) > array([1, 2, 3, 4, 5, 6, 7, 8, 9,10], dtype=uint8) 确保函数正常运行,并获得一些合理的输出。 all_theta= one_vs_all(data['X'], data['y'],10,1) all_theta array([[ -5.79312170e+00, 0.00000000e+00, 0.00000000e+00, ..., 1.22140973e-02, 2.88611969e-07, 0.00000000e+00], [ -4.91685285e+00, 0.00000000e+00, 0.00000000e+00, ..., 2.40449128e-01, -1.08488270e-02, 0.00000000e+00], [ -8.56840371e+00, 0.00000000e+00, 0.00000000e+00, ..., -2.59241796e-04, -1.12756844e-06, 0.00000000e+00], ..., [ -1.32641613e+01, 0.00000000e+00, 0.00000000e+00, ..., -5.63659404e+00, 6.50939114e-01, 0.00000000e+00], [ -8.55392716e+00, 0.00000000e+00, 0.00000000e+00, ..., -2.01206880e-01, 9.61930149e-03, 0.00000000e+00], [ -1.29807876e+01, 0.00000000e+00, 0.00000000e+00, ..., 2.60651472e-04, 4.22693052e-05, 0.00000000e+00]]) 最后一步是使用训练过的分类器预测每个图像的标签。对于这一步骤,对于每个训练实例(使用矢量化代码),我们将计算每个类的类概率,并将输出类标签分配给具有最高概率的类。 def predict_all(X, all_theta): rows = X.shape[0] params = X.shape[1] num_labels = all_theta.shape[0] # same as before, insert ones to match the shape X = np.insert(X, 0, values=np.ones(rows), axis=1) # convert to matrices X = np.matrix(X) all_theta = np.matrix(all_theta) # compute the class probability for each class on each training instance h = sigmoid(X * all_theta.T) # create array of the index with the maximum probability h_argmax = np.argmax(h, axis=1) # because our array was zero-indexed we need to add one for the true label prediction h_argmax = h_argmax + 1 return h_argmax 现在我们可以使用predict_all函数为每个实例生成类预测,并了解分类器的工作情况。 y_pred = predict_all(data['X'], all_theta) correct = [1 if a == b else 0 for (a, b) in zip(y_pred, data['y'])] accuracy = (sum(map(int, correct)) / float(len(correct))) print 'accuracy = {0}%'.format(accuracy * 100) > accuracy = 97.58% 接近98%,相当不错,逻辑回归是一个相对简单的方法。 可参考:http://www.atyun.com/4260.html
珍宝珠 2019-12-02 03:22:33 0 浏览量 回答数 0
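作为对照,也可以用 scikit-learn 自带的多分类逻辑回归在同一份数据上跑一个快速基准,验证手写实现的准确率是否合理。下面只是一个粗略示意(max_iter 等参数是随手给的,并非练习要求的做法):

```python
from sklearn.linear_model import LogisticRegression

# 假设 data['X']、data['y'] 已按上文从 ex3data1.mat 载入
clf = LogisticRegression(max_iter=1000)
clf.fit(data['X'], data['y'].ravel())
print('sklearn accuracy = {0}%'.format(clf.score(data['X'], data['y'].ravel()) * 100))
```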

问题

数独益智游戏,内含平方数

两天前,我遇到了一个数独问题,尝试使用Python 3解决。我被告知确实存在一个解决方案,但是我不确定是否存在多个解决方案。 问题如下:数独的9x9网格完全为空。但是,...
is大龙 2020-03-23 20:21:05 5 浏览量 回答数 1

回答

TLDR :重要的是,问题是在二维*中设置的。对于大尺寸,这些技术可能无效。 在2D中,我们可以在'O(n log n)`时间内计算每个簇的直径(簇间距离),其中'n'是使用凸包的簇大小。向量化用于加快剩余操作的速度。文章结尾提到了两种可能的渐近改进,欢迎贡献;) *设置和伪造数据: import numpy as np from scipy import spatial from matplotlib import pyplot as plt # set up fake data np.random.seed(0) n_centroids = 1000 centroids = np.random.rand(n_centroids, 2) cluster_sizes = np.random.randint(1, 1000, size=n_centroids) # labels from 1 to n_centroids inclusive labels = np.repeat(np.arange(n_centroids), cluster_sizes) + 1 points = np.zeros((cluster_sizes.sum(), 2)) points[:,0] = np.repeat(centroids[:,0], cluster_sizes) points[:,1] = np.repeat(centroids[:,1], cluster_sizes) points += 0.05 * np.random.randn(cluster_sizes.sum(), 2) 看起来有点像这样: 接下来,基于使用凸包的方法,我们定义一个“直径”函数,用于计算最大簇内距离。 # compute the diameter based on convex hull def diameter(pts): # need at least 3 points to construct the convex hull if pts.shape[0] <= 1: return 0 if pts.shape[0] == 2: return ((pts[0] - pts[1])\*2).sum() # two points which are fruthest apart will occur as vertices of the convex hull hull = spatial.ConvexHull(pts) candidates = pts[spatial.ConvexHull(pts).vertices] return spatial.distance_matrix(candidates, candidates).max() 对于Dunn指数计算,我假设我们已经计算了点,聚类标签和聚类质心。 如果群集数量很大,则以下基于Pandas的解决方案可能会表现良好: import pandas as pd def dunn_index_pandas(pts, labels, centroids): # O(k n log(n)) with k clusters and n points; better performance with more even clusters max_intracluster_dist = pd.DataFrame(pts).groupby(labels).agg(diameter_pandas)[0].max() # O(k^2) with k clusters; can be reduced to O(k log(k)) # get pairwise distances between centroids cluster_dmat = spatial.distance_matrix(centroids, centroids) # fill diagonal with +inf: ignore zero distance to self in "min" computation np.fill_diagonal(cluster_dmat, np.inf) min_intercluster_dist = cluster_sizes.min() return min_intercluster_dist / max_intracluster_dist 否则,我们可以继续使用纯粹的numpy解决方案。 def dunn_index(pts, labels, centroids): # O(k n log(n)) with k clusters and n points; better performance with more even clusters max_intracluster_dist = max(diameter(pts[labels==i]) for i in np.unique(labels)) # O(k^2) with k clusters; can be reduced to O(k log(k)) # get pairwise distances between centroids cluster_dmat = spatial.distance_matrix(centroids, centroids) # fill diagonal with +inf: ignore zero distance to self in "min" computation np.fill_diagonal(cluster_dmat, np.inf) min_intercluster_dist = cluster_sizes.min() return min_intercluster_dist / max_intracluster_dist %time dunn_index(points, labels, centroids) # returned value 2.15 # in 2.2 seconds %time dunn_index_pandas(points, labels, centroids) # returned 2.15 # in 885 ms 对于iid〜U [1,1000]集群大小的1000集群,这需要2.2。秒在我的机器上。在本例中,使用Pandas方法时,此数字下降到0.8秒(许多小集群)。 当集群数量很大时,还有两个其他相关的优化机会: First, I am computing the minimal intercluster distance with a brute force ` O(k^2) ` approach where ` k ` is the number of clusters. This can be reduced to ` O(k log(k)) ` , as discussed here. Second, ` max(diameter(pts[labels==i]) for i in np.unique(labels)) ` requires ` k ` passes over an array of size ` n ` . With many clusters this can become the bottleneck (as in this example). This is somewhat mitigated with the pandas approach, but I expect that this can be optimized a lot further. For current parameters, roughly one third of compute time is spent outside of computing intercluser of intracluster distances. 回答来源:stackoverflow
is大龙 2020-03-23 23:55:18 0 浏览量 回答数 0
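上面回答的关键结论是:二维点集中相距最远的两个点一定出现在凸包顶点上,所以可以先取凸包、再只在顶点之间求距离矩阵,从而降低求簇直径的代价。可以用一个小的随机点集验证凸包法与暴力法结果一致(示意如下):

```python
import numpy as np
from scipy import spatial

pts = np.random.rand(500, 2)

# 暴力法:所有点对距离的最大值
brute = spatial.distance_matrix(pts, pts).max()

# 凸包法:只在凸包顶点之间找最大距离
hull_pts = pts[spatial.ConvexHull(pts).vertices]
hull = spatial.distance_matrix(hull_pts, hull_pts).max()

print(np.isclose(brute, hull))   # True
```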

问题

Python:在Postgres中插入大型dataframe (1.2M行)的问题

我使用panda插入方法(https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#io-sql-method)来获取从.mdb文件转换而来的csv并将它们插入到Post...
kun坤 2019-12-30 09:34:45 0 浏览量 回答数 0

回答

K-Means聚类 首先,我们在一个简单的二维数据集上实现并应用k-means,以了解它如何工作。k-means是一种迭代的、无监督的聚类算法,它将类似的实例组合成集群。该算法通过猜测每个集群的初始centroid,反复向最近的集群分配实例,并重新计算该集群的centroid。首先我们要实现一个函数,它为数据中的每个实例找到最接近的centroid。 import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sb from scipy.io import loadmat %matplotlib inline def find_closest_centroids(X, centroids): m = X.shape[0] k = centroids.shape[0] idx = np.zeros(m) for i in range(m): min_dist = 1000000 for j in range(k): dist = np.sum((X[i,:] - centroids[j,:]) ** 2) if dist < min_dist: min_dist = dist idx[i] = j return idx 测试函数确保它像预期的那样工作,我们使用练习中的测试案例。 data = loadmat('data/ex7data2.mat') X = data['X'] initial_centroids = initial_centroids = np.array([[3, 3], [6, 2], [8, 5]]) idx = find_closest_centroids(X, initial_centroids) idx[0:3] array([0., 2., 1.]) 输出与文本中的预期值相匹配(我们的数组是zero-indexed而不是one-indexed,所以值比练习中的值要低1)。接下来,我们需要一个函数来计算集群的centroid。centroid是当前分配给集群的所有例子的平均值。 def compute_centroids(X, idx, k): m, n = X.shape centroids = np.zeros((k, n)) for i in range(k): indices = np.where(idx == i) centroids[i,:] = (np.sum(X[indices,:], axis=1) / len(indices[0])).ravel() return centroids compute_centroids(X, idx, 3) array([[ 2.42830111, 3.15792418], [ 5.81350331, 2.63365645], [ 7.11938687, 3.6166844 ]]) 此输出也与该练习的预期值相匹配。目前为止一切都很顺利。下一部分涉及到实际运行算法的迭代次数和可视化结果。我们在练习中实现了这一步骤,它没有那么复杂,我将从头开始构建它。为了运行这个算法,我们只需要在分配到最近集群的示例和重新计算集群的centroids之间进行交替操作。 def run_k_means(X, initial_centroids, max_iters): m, n = X.shape k = initial_centroids.shape[0] idx = np.zeros(m) centroids = initial_centroids for i in range(max_iters): idx = find_closest_centroids(X, centroids) centroids = compute_centroids(X, idx, k) return idx, centroids idx, centroids = run_k_means(X, initial_centroids, 10) 我们现在可以使用颜色编码表示集群成员。 cluster1 = X[np.where(idx == 0)[0],:] cluster2 = X[np.where(idx == 1)[0],:] cluster3 = X[np.where(idx == 2)[0],:] fig, ax = plt.subplots(figsize=(12,8)) ax.scatter(cluster1[:,0], cluster1[:,1], s=30, color='r', label='Cluster 1') ax.scatter(cluster2[:,0], cluster2[:,1], s=30, color='g', label='Cluster 2') ax.scatter(cluster3[:,0], cluster3[:,1], s=30, color='b', label='Cluster 3') ax.legend() 我们跳过了初始化centroid的过程。这可能会影响算法的收敛性。 接下来创建一个可以选择随机例子的函数,并将这些例子作为初始的centroid。 def init_centroids(X, k): m, n = X.shape centroids = np.zeros((k, n)) idx = np.random.randint(0, m, k) for i in range(k): centroids[i,:] = X[idx[i],:] return centroids init_centroids(X, 3) array([[ 1.15354031, 4.67866717], [ 6.27376271, 2.24256036], [ 2.20960296, 4.91469264]]) 我们的下一任务是应用K-means实现图像压缩。我们可以使用集群来查找图像中最具有代表性的少量的颜色,并使用集群分配将原来的24位颜色映射到一个低维度的颜色空间。这是我们要压缩的图像。 原始像素数据已经预加载了,把它输入进来。 image_data= loadmat('data/bird_small.mat') image_data {'A': array([[[219, 180, 103], [230, 185, 116], [226, 186, 110], ..., [ 14, 15, 13], [ 13, 15, 12], [ 12, 14, 12]], ..., [[ 15, 19, 19], [ 20, 20, 18], [ 18, 19, 17], ..., [ 65, 43, 39], [ 58, 37, 38], [ 52, 39, 34]]], dtype=uint8), '__globals__': [], '__header__': 'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Tue Jun 5 04:06:24 2012', '__version__': '1.0'} 我们可以快速查看数据的形状,以验证它是否像我们预期的图像。 A= image_data['A'] A.shape (128L,128L,3L) 现在我们需要对数据进行预处理,并将它输入到k-means算法中。 # normalize value ranges A = A / 255. 
# reshape the array X = np.reshape(A, (A.shape[0] * A.shape[1], A.shape[2])) # randomly initialize the centroids initial_centroids = init_centroids(X, 16) # run the algorithm idx, centroids = run_k_means(X, initial_centroids, 10) # get the closest centroids one last time idx = find_closest_centroids(X, centroids) # map each pixel to the centroid value X_recovered = centroids[idx.astype(int),:] # reshape to the original dimensions X_recovered = np.reshape(X_recovered, (A.shape[0], A.shape[1], A.shape[2])) plt.imshow(X_recovered) 我们在压缩中创建了一些artifact,尽管将原始图像映射到仅16种颜色,但图像的主要特征仍然存在。 这是关于k-means的部分,接下来我们来看关于主成分分析的部分。 主成分分析 PCA是一个可以在数据集中找到“主成分”或者最大方差方向的线性变换。它可以用于其他事物的维度减少。在这个练习中,我们需要实现PCA,并将其应用于一个简单的二维数据集,观察它是如何工作的。从加载和可视化数据集开始。 data = loadmat('data/ex7data1.mat') X = data['X'] fig, ax = plt.subplots(figsize=(12,8)) ax.scatter(X[:, 0], X[:, 1]) PCA的算法相当简单。在保证数据正规化后,输出只是原始数据协方差矩阵的单值分解。由于numpy已经有内置函数来计算矩阵协方差和SVD,我们将利用这些函数而不是从头开始。 def pca(X): # normalize the features X = (X - X.mean()) / X.std() # compute the covariance matrix X = np.matrix(X) cov = (X.T * X) / X.shape[0] # perform SVD U, S, V = np.linalg.svd(cov) return U, S, V U, S, V = pca(X) U, S, V (matrix([[-0.79241747, -0.60997914], [-0.60997914, 0.79241747]]), array([ 1.43584536, 0.56415464]), matrix([[-0.79241747, -0.60997914], [-0.60997914, 0.79241747]])) 现在我们已经有了主成分(矩阵U),我们可以利用它把原始数据投入到一个更低维度的空间,对于这个任务,我们将实现一个函数,它计算投影并只选择顶部K成分,有效地减少了维度的数量。 def project_data(X, U, k): U_reduced = U[:,:k] return np.dot(X, U_reduced) Z = project_data(X, U, 1) Z matrix([[-4.74689738], [-7.15889408], [-4.79563345], [-4.45754509], [-4.80263579], ..., [-6.44590096], [-2.69118076], [-4.61386195], [-5.88236227], [-7.76732508]]) 我们也可以通过改变采取的步骤来恢复原始数据。 def recover_data(Z, U, k): U_reduced = U[:,:k] return np.dot(Z, U_reduced.T) X_recovered = recover_data(Z, U, 1) X_recovered matrix([[ 3.76152442, 2.89550838], [ 5.67283275, 4.36677606], [ 3.80014373, 2.92523637], [ 3.53223661, 2.71900952], [ 3.80569251, 2.92950765], ..., [ 5.10784454, 3.93186513], [ 2.13253865, 1.64156413], [ 3.65610482, 2.81435955], [ 4.66128664, 3.58811828], [ 6.1549641 , 4.73790627]]) 如果我们尝试去可视化恢复的数据,会很容易的发现算法的工作原理。 fig, ax= plt.subplots(figsize=(12,8)) ax.scatter(X_recovered[:,0], X_recovered[:,1]) 注意这些点如何被压缩成一条虚线。虚线本质上是第一个主成分。当我们将数据减少到一个维度时,我们切断的第二个主成分可以被认为是与这条虚线的正交变化。由于我们失去了这些信息,我们的重建只能将这些点与第一个主成分相关联。 我们这次练习的最后一项任务是将PCA应用于脸部图像。通过使用相同降维技术,我们可以使用比原始图像少得多的数据来捕捉图像的“本质”。 faces= loadmat('data/ex7faces.mat') X= faces['X'] X.shape (5000L,1024L) 该练习代码包含一个函数,它将在网格中的数据集中渲染前100个脸部图像。你可以在练习文本中找到它们,不需要重新生成。 face= np.reshape(X[3,:], (32,32)) plt.imshow(face) 只有32 x 32灰度图像。下一步我们要在脸部图像数据集上运行PCA,并取得前100个主成分。 U, S, V= pca(X) Z= project_data(X, U,100) 现在尝试恢复原来的结构并重新渲染它。 X_recovered= recover_data(Z, U,100) face= np.reshape(X_recovered[3,:], (32,32)) plt.imshow(face) 结果并没有像预期的维度数量减少10倍,可能是因为我们丢失了一些细节部分。
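作为补充示意(非原文内容),也可以用 scikit-learn 里现成的 KMeans 和 PCA 与上面的手写实现相互印证;这里假设已安装 scikit-learn,并沿用练习中的数据文件:

from scipy.io import loadmat
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# K-Means:与 run_k_means 对应,k=3
X = loadmat('data/ex7data2.mat')['X']
km = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X)
print(km.cluster_centers_)   # 与手写版求得的 centroids 基本一致(簇的顺序可能不同)

# PCA:与 pca / project_data / recover_data 对应,保留 1 个主成分
X1 = loadmat('data/ex7data1.mat')['X']
X_norm = (X1 - X1.mean(axis=0)) / X1.std(axis=0)
p = PCA(n_components=1).fit(X_norm)
Z = p.transform(X_norm)            # 对应 project_data
X_rec = p.inverse_transform(Z)     # 对应 recover_data(主成分方向可能相差一个符号)

注意手写版的归一化用的是整体均值/标准差,这里改为按列归一化,所以数值不会完全相同,但投影得到的主方向是一致的。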
珍宝珠 2019-12-02 03:22:40 0 浏览量 回答数 0

回答

给你个例子参考:1、前端,创建一个index.html <form action="upload_file.php" method="post" enctype="multipart/form-data" target="hidden_frame"> <label for="file">Filename:</label> <input type="file" name="file" id="file"><br> <input type="submit" name="submit" value="Submit"> <iframe name='hidden_frame' id="hidden_frame" style='display:none'></iframe> </form> <script type="text/javascript"> function callback(code,info){ console.log(code+info); } </script> 2、后端,创建一个upload_file.php $url = "/uploads/"; // 存储路径 $size = 2000000; // 文件大小 $allowedExts = array("gif", "jpeg", "jpg", "png","csv"); // 可上传的后缀名 $temp = explode(".", $_FILES["file"]["name"]); $extension = end($temp); if ( ($_FILES["file"]["size"] < $size) && in_array($extension, $allowedExts) ){ if ($_FILES["file"]["error"] > 0){ $return =array('code'=> '1','info'=> "Return Code: " . $_FILES["file"]["error"]); } else{ $newfile_url = $url.Date('YmdHis').'_'.$_FILES["file"]["name"]; move_uploaded_file($_FILES["file"]["tmp_name"],$newfile_url); $return =array('code'=> '1','info'=> "Stored in: " . $newfile_url); } } else{ $return =array('code'=> '0','info'=> 'Invalid File'); } echo "<script>parent.callback('".$return['code']."','".$return['info']."')</script>"; 好的我们来一一解析,这里我们看到前端提交是使用了 target="hidden_frame"这个主要是指定hidden_frame对象来提交操作,避免页面刷新;后台最后输出一个js脚本: echo "<script>parent.callback('".$return['code']."','".$return['info']."')</script>"; 这个主要是反馈回去调前端的一个callback函数: <script type="text/javascript"> function callback(code,info){ console.log(code+info); } </script> 这样你可以再callback的函数里面写任何东西了,后台也可以传任何东西到前端来。
蛮大人123 2019-12-02 01:57:24 0 浏览量 回答数 0

回答

PHP 5.6引入了第三个参数array_filter(),flag,你可以设置为ARRAY_FILTER_USE_KEY通过键,而不是值进行筛选: $my_array = ['foo' => 1, 'hello' => 'world']; $allowed = ['foo', 'bar']; $filtered = array_filter( $my_array, function ($key) use ($allowed) { return in_array($key, $allowed); }, ARRAY_FILTER_USE_KEY ); 显然,这不如优雅array_intersect_key($my_array, array_flip($allowed)),但是它确实提供了额外的灵活性,$allowed可以对键执行任意测试,例如可以包含正则表达式模式而不是纯字符串。 您还可以ARRAY_FILTER_USE_BOTH将值和键都传递给过滤器函数。这是一个基于第一个示例的示例,但请注意,我不建议您使用$allowed这种方式编码过滤规则: $my_array = ['foo' => 1, 'bar' => 'baz', 'hello' => 'wld']; $allowed = ['foo' => true, 'bar' => true, 'hello' => 'world']; $filtered = array_filter( $my_array, function ($val, $key) use ($allowed) { // N.b. $val, $key not $key, $val return isset($allowed[$key]) && ( $allowed[$key] === true || $allowed[$key] === $val ); }, ARRAY_FILTER_USE_BOTH ); // ['foo' => 1, 'bar' => 'baz'] 问题来源于stack overflow
保持可爱mmm 2020-01-15 16:32:07 0 浏览量 回答数 0

回答

import scipy.stats as stimport matplotlib.pyplot as pltimport numpy as npimport collectionsfrom sklearn.preprocessing import MinMaxScalerimport numpy as npimport csvimport mathfrom pylab import*import matplotlib.mlab as mlabfrom sklearn.utils import shuffleimport mathi=0j=[]data = []X = []indicess = []xback =24with open(r'D:error01冬季雨天.csv') as f: reader = csv.reader(f) for row in reader: data.append(row[:])#提取出每一行中的2:14列 data1=[]data = np.array(data)m,n=np.shape(data)for i in range(m): for j in range(n): #print(data[i][j]) data[i][j] = data[i][j].astype('float64')#是从第三列开始的 for i in range(m): for j in range(n): #print(data[i][j]) data1.append(data[i][j]) print("the type of data1",type(data1[1]))data = data.astype('float64') print(data) print("the shape of data",len(data)) 定义最大似然函数后的结果 def mle(x): u = np.mean(x) thea=np.std(x) return u,thea 确定了分布 print(mle(data))u,thea=mle(data)print(u)print(thea)y = st.norm.pdf(data[:6],u,thea)print(y)count, bins, ignored =plt.hist(data,bins=20,normed=False)print("count",len(count))print("bins",len(bins))plt.plot(bins[:20],count,"r")pro=count/np.sum(count)plt.xlabel("x")plt.ylabel("probability density")plt.show() plt.plot(bins[:20],pro,"r",lw=2)plt.show()low=-1.65*thea+u #对应90%的置信度up=1.65*thea+udata0=[]print("下界为",low)print("上界为:",up) with open(r'D:真实值冬季雨天.csv') as f: reader = csv.reader(f) for row in reader: data0.append(row[:]) # 提取出每一行中的2:14列 data01=[]data0 = np.array(data0) print(data0) m,n=np.shape(data0)print("the shape of data0",np.shape(data0))for i in range(m): for j in range(n): #print(data0[i][j]) data0[i][j] = data0[i][j].astype('float64')#是从第三列开始的 for i in range(m): for j in range(n): #print(data[i][j]) data01.append(data0[i][j]) print("the type of data1",type(data1[1])) data0 = data0.astype('float64')data01=map(eval, data01)print(np.shape(data0))print(data0[:4])print(data0[:2,0])datamax=np.max(data0[:,0])datamax=np.max(data0[:,0])p_low = list(map(lambda x: (x-abs(low)*datamax) , data0[:,0]))p_up = list(map(lambda x: (x+up *datamax), data0[:,1]))x=[i for i in range(len(p_low))]print(x) 显示置信区间范围 l=90k=0plt.plot(x[k:l],p_low[k:l], 'g', lw=2, label='下界曲线')plt.plot(x[k:l],p_up[k:l], 'g', lw=2, label='上界曲线')plt.plot(x[k:l],data0[k:l,0], 'b', lw=2, label='真实值')plt.plot(data0[k:l,1], 'r', lw=2, label='预测值')plt.fill_between(x[k:l],p_low[k:l],p_up[k:l],color="c",alpha=0.1)plt.title('置信区间', fontsize=18) # 表的名称plt.legend(loc=0, numpoints=1)leg = plt.gca().get_legend()ltext = leg.get_texts()plt.setp(ltext, fontsize='small') 负责绘制与图或轴相关的数据 savefig('D:/十折交叉验证/LSTM1.jpg') plt.show() 评价置信区间PICP,PINAW,CWC,PICP用来评价预测区间的覆盖率,PINAW预测区间的宽带 count=0 for i in range(len(p_low)): if data0[i][1]>=p_low[i] and data0[i][1]<=p_up[i]: count=count+1 PICP = count/len(p_low)print("PICP",PICP) 对于概率性的区间预测方法,在置信度一样的情况下,预测区间越窄越好 max0=np.max(data0[:,1])min0=np.min(data0[:,1])sum0=list(map(lambda x: (x[1]-x[0]) , zip(p_low,p_up)))sum1=np.sum(sum0)/len(sum0)PINAW = 1/(max0-min0)*sum1print("PINAW",PINAW) 综合指标的评价cwcCWC = PINAW(1+R(PICP)np.exp(-y(PICP-U))) g=90#取值在50-100e0=math.exp(-g*(PICP-u))if PICP>=u: r=0 else: r=1 CWC=PINAW(1+rPICP*e0)print("CWC",CWC)
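作为补充,下面是 PICP、PINAW、CWC 三个预测区间评价指标的一个自包含示意实现(按常见文献中的定义书写,函数名与参数名为演示假设,置信度 mu 与惩罚系数 eta 可按需调整):

import numpy as np

def interval_metrics(y_true, lower, upper, mu=0.90, eta=50):
    y_true = np.asarray(y_true, dtype=float)
    lower = np.asarray(lower, dtype=float)
    upper = np.asarray(upper, dtype=float)
    # PICP:真实值落在预测区间内的比例
    picp = ((y_true >= lower) & (y_true <= upper)).mean()
    # PINAW:区间平均宽度,用真实值的极差做归一化
    pinaw = (upper - lower).mean() / (y_true.max() - y_true.min())
    # CWC:当 PICP 低于置信度 mu 时,对区间宽度施加指数惩罚
    gamma = 0 if picp >= mu else 1
    cwc = pinaw * (1 + gamma * np.exp(-eta * (picp - mu)))
    return picp, pinaw, cwc

使用时对真实值序列 y、下界 p_low、上界 p_up 调用 interval_metrics(y, p_low, p_up) 即可,各数组含义与上文代码一致。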
xuning715 2019-12-02 01:10:12 0 浏览量 回答数 0

回答

在云栖社区的问答区,有一位网友提到有一个问题: 表里相似数据太多,想删除相似度高的数据,有什么办法能实现吗? 例如: 银屑病怎么治? 银屑病怎么治疗? 银屑病怎么治疗好? 银屑病怎么能治疗好? 等等 解这个问题的思路 .1. 首先如何判断内容的相似度,PostgreSQL中提供了中文分词,pg_trgm(将字符串切成多个不重复的token,计算两个字符串的相似度) . 对于本题,我建议采取中文分词的方式,首先将内容拆分成词组。 .2. 在拆分成词组后,首先分组聚合,去除完全重复的数据。 .3. 然后自关联生成笛卡尔(矩阵),计算出每条记录和其他记录的相似度。相似度的算法很简单,重叠的token数量除以集合的token去重后的数量。 .4. 根据相似度,去除不需要的数据。 这里如果数据量非常庞大,使用专业的分析编程语言会更好例如 PL/R。 实操的例子: 首先要安装PostgreSQL 中文分词插件 (阿里云AliCloudDB PostgreSQL已包含这个插件,用法参考官方手册) git clone https://github.com/jaiminpan/pg_jieba.git mv pg_jieba $PGSRC/contrib/ export PATH=/home/digoal/pgsql9.5/bin:$PATH cd $PGSRC/contrib/pg_jieba make clean;make;make install git clone https://github.com/jaiminpan/pg_scws.git mv pg_jieba $PGSRC/contrib/ export PATH=/home/digoal/pgsql9.5/bin:$PATH cd $PGSRC/contrib/pg_scws make clean;make;make install 创建插件 psql # create extension pg_jieba; # create extension pg_scws; 创建测试CASE create table tdup1 (id int primary key, info text); create extension pg_trgm; insert into tdup1 values (1, '银屑病怎么治?'); insert into tdup1 values (2, '银屑病怎么治疗?'); insert into tdup1 values (3, '银屑病怎么治疗好?'); insert into tdup1 values (4, '银屑病怎么能治疗好?'); 这两种分词插件,可以任选一种。 postgres=# select to_tsvector('jiebacfg', info),* from tdup1 ; to_tsvector | id | info ---------------------+----+---------------------- '治':3 '银屑病':1 | 1 | 银屑病怎么治? '治疗':3 '银屑病':1 | 2 | 银屑病怎么治疗? '治疗':3 '银屑病':1 | 3 | 银屑病怎么治疗好? '治疗':4 '银屑病':1 | 4 | 银屑病怎么能治疗好? (4 rows) postgres=# select to_tsvector('scwscfg', info),* from tdup1 ; to_tsvector | id | info -----------------------------------+----+---------------------- '治':2 '银屑病':1 | 1 | 银屑病怎么治? '治疗':2 '银屑病':1 | 2 | 银屑病怎么治疗? '好':3 '治疗':2 '银屑病':1 | 3 | 银屑病怎么治疗好? '好':4 '治疗':3 '能':2 '银屑病':1 | 4 | 银屑病怎么能治疗好? (4 rows) 创建三个函数, 计算2个数组的集合(去重后的集合) postgres=# create or replace function array_union(text[], text[]) returns text[] as $$ select array_agg(c1) from (select c1 from unnest($1||$2) t(c1) group by c1) t; $$ language sql strict; CREATE FUNCTION 数组去重 postgres=# create or replace function array_dist(text[]) returns text[] as $$ select array_agg(c1) from (select c1 from unnest($1) t(c1) group by c1) t; $$ language sql strict; CREATE FUNCTION 计算两个数组的重叠部分(去重后的重叠部分) postgres=# create or replace function array_share(text[], text[]) returns text[] as $$ select array_agg(unnest) from (select unnest($1) intersect select unnest($2) group by 1) t; $$ language sql strict; CREATE FUNCTION 笛卡尔结果是这样的: regexp_split_to_array((regexp_replace(to_tsvector('jiebacfg',info)::text,'(:d+)', '', 'g')),' ') 用于将info转换成数组。 postgres=# with t(c1,c2,c3) as (select id,info,array_dist(regexp_split_to_array((regexp_replace(to_tsvector('jiebacfg',info)::text,'(:\d+)', '', 'g')),' ')) from tdup1) select * from (select t1.c1 t1c1,t2.c1 t2c1,t1.c2 t1c2,t2.c2 t2c2,t1.c3 t1c3,t2.c3 t2c3,round(array_length(array_share(t1.c3,t2.c3),1)::numeric/array_length(array_union(t1.c3,t2.c3),1),2) simulate from t t1,t t2) t; t1c1 | t2c1 | t1c2 | t2c2 | t1c3 | t2c3 | simulate ------+------+----------------------+----------------------+-------------------+-------------------+---------- 1 | 1 | 银屑病怎么治? | 银屑病怎么治? | {'银屑病','治'} | {'银屑病','治'} | 1.00 1 | 2 | 银屑病怎么治? | 银屑病怎么治疗? | {'银屑病','治'} | {'银屑病','治疗'} | 0.33 1 | 3 | 银屑病怎么治? | 银屑病怎么治疗好? | {'银屑病','治'} | {'银屑病','治疗'} | 0.33 1 | 4 | 银屑病怎么治? | 银屑病怎么能治疗好? | {'银屑病','治'} | {'银屑病','治疗'} | 0.33 2 | 1 | 银屑病怎么治疗? | 银屑病怎么治? | {'银屑病','治疗'} | {'银屑病','治'} | 0.33 2 | 2 | 银屑病怎么治疗? | 银屑病怎么治疗? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00 2 | 3 | 银屑病怎么治疗? | 银屑病怎么治疗好? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00 2 | 4 | 银屑病怎么治疗? | 银屑病怎么能治疗好? 
| {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00 3 | 1 | 银屑病怎么治疗好? | 银屑病怎么治? | {'银屑病','治疗'} | {'银屑病','治'} | 0.33 3 | 2 | 银屑病怎么治疗好? | 银屑病怎么治疗? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00 3 | 3 | 银屑病怎么治疗好? | 银屑病怎么治疗好? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00 3 | 4 | 银屑病怎么治疗好? | 银屑病怎么能治疗好? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00 4 | 1 | 银屑病怎么能治疗好? | 银屑病怎么治? | {'银屑病','治疗'} | {'银屑病','治'} | 0.33 4 | 2 | 银屑病怎么能治疗好? | 银屑病怎么治疗? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00 4 | 3 | 银屑病怎么能治疗好? | 银屑病怎么治疗好? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00 4 | 4 | 银屑病怎么能治疗好? | 银屑病怎么能治疗好? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00 (16 rows) 以上生成的实际上是一个矩阵,simulate就是矩阵中我们需要计算的相似度: 我们在去重计算时不需要所有的笛卡尔积,只需要这个矩阵对角线的上部分或下部分数据即可。 所以加个条件就能完成。 postgres=# with t(c1,c2,c3) as (select id,info,array_dist(regexp_split_to_array((regexp_replace(to_tsvector('jiebacfg',info)::text,'(:\d+)', '', 'g')),' ')) from tdup1) select * from (select t1.c1 t1c1,t2.c1 t2c1,t1.c2 t1c2,t2.c2 t2c2,t1.c3 t1c3,t2.c3 t2c3,round(array_length(array_share(t1.c3,t2.c3),1)::numeric/array_length(array_union(t1.c3,t2.c3),1),2) simulate from t t1,t t2 where t1.c1<>t2.c1 and t1.c1<t2.c1) t; t1c1 | t2c1 | t1c2 | t2c2 | t1c3 | t2c3 | simulate ------+------+--------------------+----------------------+-------------------+-------------------+---------- 1 | 2 | 银屑病怎么治? | 银屑病怎么治疗? | {'银屑病','治'} | {'银屑病','治疗'} | 0.33 1 | 3 | 银屑病怎么治? | 银屑病怎么治疗好? | {'银屑病','治'} | {'银屑病','治疗'} | 0.33 1 | 4 | 银屑病怎么治? | 银屑病怎么能治疗好? | {'银屑病','治'} | {'银屑病','治疗'} | 0.33 2 | 3 | 银屑病怎么治疗? | 银屑病怎么治疗好? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00 2 | 4 | 银屑病怎么治疗? | 银屑病怎么能治疗好? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00 3 | 4 | 银屑病怎么治疗好? | 银屑病怎么能治疗好? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00 (6 rows) 开始对这些数据去重,去重的第一步,明确simulate, 例如相似度大于0.5的,需要去重。 postgres=# with t(c1,c2,c3) as (select id,info,array_dist(regexp_split_to_array((regexp_replace(to_tsvector('jiebacfg',info)::text,'(:\d+)', '', 'g')),' ')) from tdup1) select * from (select t1.c1 t1c1,t2.c1 t2c1,t1.c2 t1c2,t2.c2 t2c2,t1.c3 t1c3,t2.c3 t2c3,round(array_length(array_share(t1.c3,t2.c3),1)::numeric/array_length(array_union(t1.c3,t2.c3),1),2) simulate from t t1,t t2 where t1.c1<>t2.c1 and t1.c1<t2.c1) t where simulate>0.5; t1c1 | t2c1 | t1c2 | t2c2 | t1c3 | t2c3 | simulate ------+------+--------------------+----------------------+-------------------+-------------------+---------- 2 | 3 | 银屑病怎么治疗? | 银屑病怎么治疗好? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00 2 | 4 | 银屑病怎么治疗? | 银屑病怎么能治疗好? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00 3 | 4 | 银屑病怎么治疗好? | 银屑病怎么能治疗好? 
| {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00 (3 rows) 去重第二步,将t2c1列的ID对应的记录删掉即可。 delete from tdup1 where id in (with t(c1,c2,c3) as (select id,info,array_dist(regexp_split_to_array((regexp_replace(to_tsvector('jiebacfg',info)::text,'(:\d+)', '', 'g')),' ')) from tdup1) select t2c1 from (select t1.c1 t1c1,t2.c1 t2c1,t1.c2 t1c2,t2.c2 t2c2,t1.c3 t1c3,t2.c3 t2c3,round(array_length(array_share(t1.c3,t2.c3),1)::numeric/array_length(array_union(t1.c3,t2.c3),1),2) simulate from t t1,t t2 where t1.c1<>t2.c1 and t1.c1<t2.c1) t where simulate>0.5); 例如 : postgres=# insert into tdup1 values (11, '白血病怎么治?'); INSERT 0 1 postgres=# insert into tdup1 values (22, '白血病怎么治疗?'); INSERT 0 1 postgres=# insert into tdup1 values (13, '白血病怎么治疗好?'); INSERT 0 1 postgres=# insert into tdup1 values (24, '白血病怎么能治疗好?'); INSERT 0 1 postgres=# postgres=# with t(c1,c2,c3) as (select id,info,array_dist(regexp_split_to_array((regexp_replace(to_tsvector('jiebacfg',info)::text,'(:\d+)', '', 'g')),' ')) from tdup1) select * from (select t1.c1 t1c1,t2.c1 t2c1,t1.c2 t1c2,t2.c2 t2c2,t1.c3 t1c3,t2.c3 t2c3,round(array_length(array_share(t1.c3,t2.c3),1)::numeric/array_length(array_union(t1.c3,t2.c3),1),2) simulate from t t1,t t2 where t1.c1<>t2.c1 and t1.c1<t2.c1) t where simulate>0.5; t1c1 | t2c1 | t1c2 | t2c2 | t1c3 | t2c3 | simulate ------+------+--------------------+----------------------+-------------------+-------------------+---------- 2 | 3 | 银屑病怎么治疗? | 银屑病怎么治疗好? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00 2 | 4 | 银屑病怎么治疗? | 银屑病怎么能治疗好? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00 3 | 4 | 银屑病怎么治疗好? | 银屑病怎么能治疗好? | {'银屑病','治疗'} | {'银屑病','治疗'} | 1.00 22 | 24 | 白血病怎么治疗? | 白血病怎么能治疗好? | {'治疗','白血病'} | {'治疗','白血病'} | 1.00 13 | 22 | 白血病怎么治疗好? | 白血病怎么治疗? | {'治疗','白血病'} | {'治疗','白血病'} | 1.00 13 | 24 | 白血病怎么治疗好? | 白血病怎么能治疗好? | {'治疗','白血病'} | {'治疗','白血病'} | 1.00 (6 rows) postgres=# begin; BEGIN postgres=# delete from tdup1 where id in (with t(c1,c2,c3) as postgres(# (select id,info,array_dist(regexp_split_to_array((regexp_replace(to_tsvector('jiebacfg',info)::text,'(:\d+)', '', 'g')),' ')) from tdup1) postgres(# select t2c1 from (select t1.c1 t1c1,t2.c1 t2c1,t1.c2 t1c2,t2.c2 t2c2,t1.c3 t1c3,t2.c3 t2c3,round(array_length(array_share(t1.c3,t2.c3),1)::numeric/array_length(array_union(t1.c3,t2.c3),1),2) postgres(# simulate from t t1,t t2 where t1.c1<>t2.c1 and t1.c1<t2.c1) t where simulate>0.5); DELETE 4 postgres=# select * from tdup1 ; id | info ----+-------------------- 1 | 银屑病怎么治? 2 | 银屑病怎么治疗? 11 | 白血病怎么治? 13 | 白血病怎么治疗好? 
(4 rows) 用数据库解会遇到的问题, 因为我们的JOIN filter是<>和<,用不上hashjoin。 数据量比较大的情况下,耗时会非常的长。 postgres=# explain delete from tdup1 where id in (with t(c1,c2,c3) as (select id,info,array_dist(regexp_split_to_array((regexp_replace(to_tsvector('jiebacfg',info)::text,'(:\d+)', '', 'g')),' ')) from tdup1) select t2c1 from (select t1.c1 t1c1,t2.c1 t2c1,t1.c2 t1c2,t2.c2 t2c2,t1.c3 t1c3,t2.c3 t2c3,round(array_length(array_share(t1.c3,t2.c3),1)::numeric/array_length(array_union(t1.c3,t2.c3),1),2) simulate from t t1,t t2 where t1.c1<>t2.c1 and t1.c1<t2.c1) t where simulate>0.5); QUERY PLAN ---------------------------------------------------------------------------------------------------------------------- Delete on tdup1 (cost=10005260133.58..10005260215.84 rows=2555 width=34) -> Hash Join (cost=10005260133.58..10005260215.84 rows=2555 width=34) Hash Cond: (tdup1.id = "ANY_subquery".t2c1) -> Seq Scan on tdup1 (cost=0.00..61.10 rows=5110 width=10) -> Hash (cost=10005260131.08..10005260131.08 rows=200 width=32) -> HashAggregate (cost=10005260129.08..10005260131.08 rows=200 width=32) Group Key: "ANY_subquery".t2c1 -> Subquery Scan on "ANY_subquery" (cost=10000002667.20..10005252911.99 rows=2886838 width=32) -> Subquery Scan on t (cost=10000002667.20..10005224043.61 rows=2886838 width=4) Filter: (t.simulate > 0.5) CTE t -> Seq Scan on tdup1 tdup1_1 (cost=0.00..2667.20 rows=5110 width=36) -> Nested Loop (cost=10000000000.00..10005113119.99 rows=8660513 width=68) Join Filter: ((t1.c1 <> t2.c1) AND (t1.c1 < t2.c1)) -> CTE Scan on t t1 (cost=0.00..102.20 rows=5110 width=36) -> CTE Scan on t t2 (cost=0.00..102.20 rows=5110 width=36) (16 rows) 其他更优雅的方法,使用PLR或者R进行矩阵运算,得出结果后再进行筛选。 PLR R 或者使用MPP数据库例如Greenplum加上R和madlib可以对非常庞大的数据进行处理。 MADLIB MPP 小结 这里用到了PG的什么特性? .1. 中文分词 .2. 窗口查询功能 (本例中没有用到,但是如果你的数据没有主键时,则需要用ctid和row_number来定位到一条唯一记录)
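作为补充示意(非原回答内容),同样的"分词 → 去重集合 → 两两相似度"思路也可以在数据库外用 Python 实现,便于超大数据量时分批处理;这里假设使用 jieba 分词,分词结果可能与 pg_jieba 略有差异:

import jieba

docs = {1: '银屑病怎么治?', 2: '银屑病怎么治疗?', 3: '银屑病怎么治疗好?', 4: '银屑病怎么能治疗好?'}
tokens = {i: set(jieba.cut(text)) for i, text in docs.items()}

def similarity(a, b):
    # 与 SQL 版一致:重叠 token 数除以并集 token 数
    return len(a & b) / len(a | b)

to_delete = set()
ids = sorted(docs)
for idx, i in enumerate(ids):
    for j in ids[idx + 1:]:
        if j not in to_delete and similarity(tokens[i], tokens[j]) > 0.5:
            to_delete.add(j)   # 与 SQL 版一样,保留 id 较小的那条记录

kept = {i: docs[i] for i in ids if i not in to_delete}

这个两两比较同样是 O(n^2) 的,数据量真正很大时仍建议像原文那样先分词入库处理,或改用 MinHash/LSH 一类的近似去重方法。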
德哥 2019-12-02 01:43:06 0 浏览量 回答数 0

问题

是否有可能把保存切片数组的 4 个变量合并为单个变量?

(Noob)我有一个程序,它获取一个数组,根据其各自的功能对某些元素进行切片,然后arCalc对它们进行操作。从arSlice返回所有这些值似乎不可行,因为main()中的函数调用需要...
kun坤 2019-12-27 10:12:44 1 浏览量 回答数 1

问题

用 numpy 向量化地从截断高斯分布中生成值

我有一个函数,它从截断的正态分布中生成一个值,并用一个 while 循环保证:凡是落在截断范围之外的生成值都会被舍弃并重新生成,直到得到位于范围之内的值为止。 def g...
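从预览的描述看,一个常见的向量化思路是直接用 scipy.stats.truncnorm 一次性生成全部落在截断区间内的样本,而不必在循环里逐个拒绝;下面是一个示意(参数取值为演示假设,并非提问者的原始代码):

import numpy as np
from scipy import stats

def truncated_normal(mean, std, low, high, size):
    # truncnorm 的 a、b 以标准化坐标 (边界 - 均值)/标准差 表示截断位置
    a, b = (low - mean) / std, (high - mean) / std
    return stats.truncnorm.rvs(a, b, loc=mean, scale=std, size=size)

samples = truncated_normal(mean=0.0, std=1.0, low=-1.5, high=2.0, size=10000)
assert samples.min() >= -1.5 and samples.max() <= 2.0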
is大龙 2020-03-24 22:47:23 0 浏览量 回答数 1

回答

最大似然估计(MLE)是获得分布参数点估计的最重要的方法之一,可以从这里入手。

解析解:

多项分布(multinomial distribution)是二项分布的推广,它的 MLE 可以解析求出。完整推导可参阅 Math Stack Exchange 上的帖子(MLE for Multinomial Distribution)。过程从定义以观测数据 x(i) 为条件的似然函数 L(p) 开始,其中 p 和 x 分别是 k 个类/类别的概率和观测到的出现次数,i = 1, 2, …, k。它衡量在给定参数集 p 下观察到这组观测值 x 的可能性:

L(p) = n! / (x_1! · x_2! · … · x_k!) × p_1^x_1 · p_2^x_2 · … · p_k^x_k

主要思想是在参数 p 的取值范围上最大化似然函数值。给定总观测数 n(即所有类别出现次数之和),点估计等于:

a.values/a.values.sum() # point estimates for p = x/n
# array([[0. ], [0.02941176], [0.05882353], [0.08823529],
# [0.05882353], [0.02941176], [0.17647059], [0. ],
# [0.02941176], [0.02941176], [0.20588235], [0.29411765]])

数值解:

上述结果也可以用 scipy.optimize.minimize 以数值方法得到。注意 L(p) 是阶乘项和指数项的乘积:阶乘项是常数,不依赖于参数 p,因此不参与优化;对指数项则最好做对数变换来简化目标函数,这是 MLE 的常见做法,因为 log 是单调递增函数。此外,由于 scipy.optimize.minimize 做的是最小化,我们改为最小化对数似然函数的负值——最大化一个函数等价于最小化它的负值。

import pandas as pd
import numpy as np
import scipy.optimize as sciopt

# bounds for parameters to lie between (0,1),
# absolute zero (0) for lower bound avoided as log takes an infinite value
bnds = [(0.001e-12,1) for i in range(12)]

# Initializing parameters value for optimization
init_parameters = np.asarray([0.1 for i in range(12)])

# Negative Log Likelihood Function
# a: 提问中给出的各类别计数数据(pandas 对象),共 12 个类别
neg_log_lik = lambda p: -np.sum([a.values[i]*np.log(p[i]) for i in range(12)])

# Constraint sum(p) = 1
cons = {'type': 'eq', 'fun': lambda p: (sum([p[i] for i in range(12)]) - 1) }

# Minimizing neg_log_lik
results = sciopt.minimize(neg_log_lik, x0 = init_parameters, method='SLSQP',
                          bounds= bnds, constraints= cons)

results.x # point estimates for p
# array([1.00000000e-15, 2.94179308e-02, 5.88243586e-02, 8.82394605e-02,
# 5.88243586e-02, 2.94059735e-02, 1.76454713e-01, 1.00000000e-15,
# 2.94134577e-02, 2.94135714e-02, 2.05849197e-01, 2.94156978e-01])

有关上述实现的细节,请参阅 scipy.optimize.minimize 的文档。 不知道对不对
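作为补充,下面用一组虚构的计数数据做一个自包含的小实验(仅为演示),验证数值优化得到的结果与解析解 x/n 基本一致:

import numpy as np
import scipy.optimize as sciopt

x = np.array([5, 9, 2, 14], dtype=float)   # 各类别的观测次数(虚构数据)
n = x.sum()
analytic = x / n                            # 解析解:p_i 的点估计为 x_i / n

k = len(x)
neg_log_lik = lambda p: -np.sum(x * np.log(p))
cons = {'type': 'eq', 'fun': lambda p: p.sum() - 1}
bnds = [(1e-12, 1)] * k
res = sciopt.minimize(neg_log_lik, x0=np.full(k, 1.0/k),
                      method='SLSQP', bounds=bnds, constraints=cons)

print(analytic)   # [0.1667 0.3    0.0667 0.4667]
print(res.x)      # 数值解应与解析解十分接近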
寒喵 2019-12-02 01:08:50 0 浏览量 回答数 0
