import numpy as np
import matplotlib.pyplot as plt
from pylab import mpl

# Matplotlib's default fonts have no Chinese glyphs, so the font settings are
# patched at runtime.
mpl.rcParams['axes.unicode_minus'] = False  # keep the minus sign '-' from rendering as a box
plt.rcParams['font.sans-serif'] = ['Simhei']  # sans-serif font that can display Chinese text
# Training samples for the regression demo: inputs (x_data) and targets (y_data).
x_data = [338., 333., 328., 207., 226., 25., 179., 60., 208., 606.]
y_data = [640., 633., 619., 393., 428., 27., 193., 66., 226., 1591.]

# The same samples as NumPy arrays, for vectorized arithmetic.
x_d, y_d = (np.asarray(samples) for samples in (x_data, y_data))
`numpy.arange(start, stop, step, dtype=None)`:按给定步长生成等间隔的数组。
数值在半开区间 [开始, 停止) 内生成(换句话说,包括开始但不包括停止的区间),返回的是 ndarray。
# Parameter grid on which the loss surface is evaluated:
# x holds the candidate bias values b, y the candidate weight values w.
x = np.arange(-200, -100, 1)
y = np.arange(-5, 5, 0.1)

# np.meshgrid(x, y) produces arrays of shape (len(y), len(x)): rows follow y,
# columns follow x. Z is filled as Z[j][i] with j indexing y and i indexing x,
# so it must use the same layout; allocating it as (len(x), len(y)) only
# works while both axes happen to have the same number of points.
Z = np.zeros((len(y), len(x)))
X, Y = np.meshgrid(x, y)
# Loss surface: mean squared error of the model y = b + w * x, evaluated at
# every (b, w) pair of the grid.
for i, b in enumerate(x):
    for j, w in enumerate(y):
        # meshgrid layout: the row index follows y (w), the column index follows x (b)
        squared_error = 0.0
        for x_n, y_n in zip(x_data, y_data):
            squared_error += (y_n - b - w * x_n) ** 2
        Z[j][i] = squared_error / len(x_data)
# Linear regression fitted with plain gradient descent (one shared learning rate).
b = -120
w = -4
# b=-2
# w=0.01
lr = 0.0000001
iteration = 100000

# Parameter and loss values recorded at every step, for plotting afterwards.
b_history = [b]
w_history = [w]
loss_history = []

import time

start = time.time()
m = float(len(x_d))  # number of samples; does not change between iterations
for i in range(iteration):
    y_hat = w * x_d + b
    residual = y_d - y_hat

    # Mean squared error and its partial derivatives with respect to b and w.
    loss = np.dot(residual, residual) / m
    grad_b = -2.0 * np.sum(residual) / m
    grad_w = -2.0 * np.dot(residual, x_d) / m

    # Gradient step on both parameters.
    b -= lr * grad_b
    w -= lr * grad_w

    b_history.append(b)
    w_history.append(w)
    loss_history.append(loss)

    if i % 10000 == 0:
        print("Step %i, w: %0.4f, b: %.4f, Loss: %.4f" % (i, w, b, loss))
end = time.time()
print("大约需要时间:",end-start)
Step 0, w: 1.6534, b: -119.9839, Loss: 3670819.0000 Step 10000, w: 2.4781, b: -121.8628, Loss: 11428.6652 Step 20000, w: 2.4834, b: -123.6924, Loss: 11361.7161 Step 30000, w: 2.4885, b: -125.4716, Loss: 11298.3964 Step 40000, w: 2.4935, b: -127.2020, Loss: 11238.5092 Step 50000, w: 2.4983, b: -128.8848, Loss: 11181.8685 Step 60000, w: 2.5030, b: -130.5213, Loss: 11128.2983 Step 70000, w: 2.5076, b: -132.1129, Loss: 11077.6321 Step 80000, w: 2.5120, b: -133.6607, Loss: 11029.7126 Step 90000, w: 2.5164, b: -135.1660, Loss: 10984.3908 大约需要时间: 1.8699753284454346
# Plot the loss surface together with the path taken by gradient descent.
jet = plt.get_cmap('jet')
plt.contourf(x, y, Z, 50, alpha=0.5, cmap=jet)  # filled contour map of the loss

# Orange cross: position of the optimal solution.
plt.plot([-188.4], [2.67], 'x', markersize=12, markeredgewidth=3, color="orange")
# Black line with dots: parameter values visited during the iterations.
plt.plot(b_history, w_history, 'o-', markersize=3, linewidth=1.5, color='black')

plt.xlabel(r'$b$')
plt.ylabel(r'$w$')
plt.xlim(-200, -100)  # limit both axes to the displayed parameter range
plt.ylim(-5, 5)
plt.title("线性回归")
plt.show()
横坐标是 b,纵坐标是 w,橙色叉叉(×)标记的位置是最优解。显然,图中的迭代并没有到达最优解,离最优解还十分遥远。那么我们就调大 learning rate,取 lr = 0.000001(调大10倍),得到结果如下图,发现还不如一开始的 lr 值效果好。
我们再调大learning rate,lr = 0.00001(调大10倍),得到结果如下图,发现更加接近最优解了,但是在b=-120的这条竖线往上随着迭代过程中出现剧烈震荡的现象:
结果发现learning rate太大了,结果很不好。
3.给b和w特制化两种learning rate
所以我们给b和w特制化两种learning rate
# Linear regression with per-parameter learning rates (Adagrad): the step for
# b and for w is lr divided by the square root of that parameter's own
# accumulated squared gradients.
b = -120
w = -4
lr = 1
iteration = 100000

b_history = [b]
w_history = [w]

# Running sums of squared gradients, one accumulator per parameter.
lr_b = 0
lr_w = 0

import time
start = time.time()
for i in range(iteration):
    b_grad = 0.0
    w_grad = 0.0
    for n in range(len(x_data)):
        # Residual of the current model y = b + w * x on sample n. The bias b
        # must appear here; using the sample index n instead makes both
        # gradients independent of b, so b never converges.
        error = y_data[n] - b - w * x_data[n]
        b_grad = b_grad - 2.0 * error * 1.0
        w_grad = w_grad - 2.0 * error * x_data[n]

    lr_b = lr_b + b_grad ** 2
    lr_w = lr_w + w_grad ** 2

    # Update each parameter with its own adaptive step size.
    b -= lr / np.sqrt(lr_b) * b_grad
    w -= lr / np.sqrt(lr_w) * w_grad

    b_history.append(b)
    w_history.append(w)
# Plot the loss surface together with the recorded parameter trajectory.
color_map = plt.get_cmap('jet')
plt.contourf(x, y, Z, 50, alpha=0.5, cmap=color_map)  # filled contour map of the loss

# Marker at (b, w) = (-188.4, 2.67), then the recorded (b, w) history.
plt.plot([-188.4], [2.67], 'x', markersize=12, markeredgewidth=3, color="orange")
plt.plot(b_history, w_history, 'o-', markersize=3, linewidth=1.5, color='black')

plt.xlabel(r'$b$')
plt.ylabel(r'$w$')
plt.xlim(-200, -100)
plt.ylim(-5, 5)
plt.title("线性回归")
plt.show()
有了新的特制化两种learning rate就可以在10w次迭代之内到达最优点了。
第二部分:Python Basics with Numpy
Avoid using for-loops and while-loops, unless you are explicitly told to do so.就是少用或者不用循环,防止时间复杂度过高。
在anaconda jupyter上运行代码只需要shift+enter;如果如要查询文档,如np.exp的详解,就直接开一个新的cell写np.exp?即可。或者看官方文档:https://docs.scipy.org/doc/numpy-1.10.1/reference/generated/numpy.exp.html
Be able to use numpy functions and numpy matrix/vector operations
理解广播 “broadcasting”
Be able to vectorize code
$$\operatorname{sigmoid}(x)=\frac{1}{1+e^{-x}}$$
- ps:使用包的用法package_name.function(),如math.exp()。
import math
# from public_tests import *


# GRADED FUNCTION: basic_sigmoid
def basic_sigmoid(x):
    """Compute the sigmoid of a scalar, 1 / (1 + e^(-x)).

    Arguments:
    x -- A scalar

    Return:
    s -- sigmoid(x)

    Raises:
    TypeError -- if x does not support unary minus (for example a list).
    """
    # Negate first so that non-numeric inputs such as lists fail on the unary
    # minus before any comparison is attempted.
    neg_x = -x
    if x >= 0:
        s = 1 / (1 + math.exp(neg_x))
    else:
        # For very negative x, exp(-x) exceeds the float range and math.exp
        # raises OverflowError. The equivalent form e^x / (1 + e^x) only
        # needs exp(x), which lies in [0, 1) here.
        exp_x = math.exp(x)
        s = exp_x / (1 + exp_x)
    return s


print("basic_sigmoid(1) = " + str(basic_sigmoid(1)))
# prints: basic_sigmoid(1) = 0.7310585786300049
### One reason why we use "numpy" instead of "math" in Deep Learning ###
x = [1, 2, 3]
basic_sigmoid(x) # you will see this give an error when you run it, because x is a vector.

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-6-8ccefa5bf989> in <module>()
      1 ### One reason why we use "numpy" instead of "math" in Deep Learning ###
      2 x = [1, 2, 3]
----> 3 basic_sigmoid(x) # you will see this give an error when you run it, because x is a vector.

<ipython-input-3-28be2b65a92e> in basic_sigmoid(x)
     15
     16     ### START CODE HERE ### (≈ 1 line of code)
---> 17     s = 1/(1 + math.exp(-x))
     18     ### END CODE HERE ###
     19

TypeError: bad operand type for unary -: 'list'
import numpy as np

# Example of np.exp: the exponential is applied to every element of the array.
t_x = np.array([1, 2, 3])
exp_t_x = np.exp(t_x)  # (exp(1), exp(2), exp(3))
print(exp_t_x)
# prints: [ 2.71828183  7.3890561  20.08553692]
# GRADED FUNCTION: sigmoid
def sigmoid(x):
    """Compute the sigmoid of x, 1 / (1 + e^(-x)), element-wise.

    Arguments:
    x -- A scalar or numpy array of any size

    Return:
    s -- sigmoid(x)
    """
    s = 1 / (1 + np.exp(-x))
    return s


t_x = np.array([1, 2, 3])
print("sigmoid(t_x) = " + str(sigmoid(t_x)))
# prints: sigmoid(t_x) = [0.73105858 0.88079708 0.95257413]

# sigmoid_test is provided by public_tests, whose import is commented out in
# this file, so calling it would raise NameError. It is left disabled here.
# sigmoid_test(sigmoid)
2)sigmoid gradient
通过反向传播(backpropagation)计算梯度来优化损失函数(loss functions)
# GRADED FUNCTION: sigmoid_derivative
def sigmoid_derivative(x):
    """Return the gradient (slope) of the sigmoid function at x.

    The sigmoid output is computed once and reused, because
    sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)).

    Arguments:
    x -- A scalar or numpy array

    Return:
    ds -- The computed gradient.
    """
    activation = sigmoid(x)
    return activation * (1 - activation)


t_x = np.array([1, 2, 3])
print ("sigmoid_derivative(t_x) = " + str(sigmoid_derivative(t_x)))
# prints: sigmoid_derivative(t_x) = [0.19661193 0.10499359 0.04517666]
#sigmoid_derivative_test(sigmoid_derivative)
类似numpy的shape和reshape函数,如果我们要将一张图片,3维矩阵(length,height,depth=3)作为算法的输入时,需要将其形状转为(length * height * 3,1),即将3维张量展开成一维向量。
(1)reshape an array v of shape (a, b, c) into a vector of shape (a*b, c)。如果想直接展开成 shape 为 (a*b*c, 1) 的列向量,也可以使用 v = v.reshape(-1, 1)。
# Merge the first two axes of v: shape (a, b, c) becomes (a*b, c),
# where a = v.shape[0], b = v.shape[1] and c = v.shape[2].
v = v.reshape((v.shape[0] * v.shape[1], v.shape[2]))
# GRADED FUNCTION:image2vector
def image2vector(image):
    """Unroll a 3-D image array into a single column vector.

    Argument:
    image -- a numpy array of shape (length, height, depth)

    Returns:
    v -- a vector of shape (length*height*depth, 1)
    """
    n_values = image.shape[0] * image.shape[1] * image.shape[2]
    return image.reshape(n_values, 1)


# This is a 3 by 3 by 2 array; typically images will be (num_px_x, num_px_y, 3)
# where 3 represents the RGB values.
t_image = np.array([
    [[0.67826139, 0.29380381],
     [0.90714982, 0.52835647],
     [0.4215251, 0.45017551]],

    [[0.92814219, 0.96677647],
     [0.85304703, 0.52351845],
     [0.19981397, 0.27417313]],

    [[0.60659855, 0.00533165],
     [0.10820313, 0.49978937],
     [0.34144279, 0.94630077]],
])

print ("image2vector(image) = " + str(image2vector(t_image)))
image2vector(image) = [[0.67826139] [0.29380381] [0.90714982] [0.52835647] [0.4215251 ] [0.45017551] [0.92814219] [0.96677647] [0.85304703] [0.52351845] [0.19981397] [0.27417313] [0.60659855] [0.00533165] [0.10820313] [0.49978937] [0.34144279] [0.94630077]]
在下面的 `np.linalg.norm` 中,参数 `ord` 用来指定所计算的范数类型:
$$x=\begin{bmatrix} 0 & 3 & 4 \\ 2 & 6 & 4 \end{bmatrix}$$
计算每个行向量的模长,得到 shape 为 (2,1) 的矩阵:$$\|x\| = \text{np.linalg.norm(x, axis=1, keepdims=True)} = \begin{bmatrix} 5 \\ \sqrt{56} \end{bmatrix}$$
# GRADED FUNCTION: normalize_rows
def normalize_rows(x):
    """Scale every row of the matrix x so that it has unit length.

    Argument:
    x -- A numpy matrix of shape (n, m)

    Returns:
    x -- The normalized (by row) numpy matrix.
    """
    # One norm per row; keepdims=True keeps the result as a column so that it
    # broadcasts against x in the division below.
    row_norms = np.linalg.norm(x, axis=1, keepdims=True)
    normalized = x / row_norms

    # Show the shapes taking part in the broadcast division.
    print(normalized.shape)
    print(row_norms.shape)
    return normalized


x = np.array([[0, 3, 4],
              [1, 6, 4]])
print("normalizeRows(x) = " + str(normalize_rows(x)))
# normalizeRows_test(normalize_rows)
注意上面不要使用 x /= x_norm。这里的 x 是整数数组,而除法的结果是浮点数,原地运算 /= 无法把浮点结果写回整数数组,会抛出类型转换错误;x / x_norm 则会新建一个浮点数组。如下所示,x_norm 的 shape 是 (2,1),它与 shape 为 (2,3) 的 x 相除时用到了广播(下面会说)。
(2, 3) (2, 1) normalizeRows(x) = [[0. 0.6 0.8 ] [0.13736056 0.82416338 0.54944226]]
5)广播和softmax function
广播法则(Broadcast)是科学运算中经常用到的一个技巧,它在快速执行向量化的同时不会占用额外的内存/显存。Numpy的广播法则定义如下:
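$$\text{for } x \in \mathbb{R}^{1 \times n},\quad \operatorname{softmax}(x) = \operatorname{softmax}\left(\begin{bmatrix} x_1 & x_2 & \dots & x_n \end{bmatrix}\right) = \begin{bmatrix} \dfrac{e^{x_1}}{\sum_{j} e^{x_j}} & \dfrac{e^{x_2}}{\sum_{j} e^{x_j}} & \dots & \dfrac{e^{x_n}}{\sum_{j} e^{x_j}} \end{bmatrix}$$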
Notes: 后面部分会用m表示number of training examples,and each training example is in its own column of the matrix. Also, each feature will be in its own row (each row has data for the same feature).
Softmax should be performed for all features of each training example, so softmax would be performed on the columns (once we switch to that representation later in this course).
# GRADED FUNCTION: softmax
def softmax(x):
    """Calculate the softmax for each row of the input x.

    Works for a row vector of shape (1, n) and for matrices of shape (m, n).

    Argument:
    x -- A numpy matrix of shape (m,n)

    Returns:
    s -- A numpy matrix equal to the softmax of x, of shape (m,n)
    """
    # Subtract each row's maximum before exponentiating. Softmax is unchanged
    # by adding a constant to a row, and with the largest exponent at 0
    # np.exp cannot overflow to inf (which would turn the row into nan).
    shifted = x - np.max(x, axis=1, keepdims=True)

    # Apply exp() element-wise.
    x_exp = np.exp(shifted)

    # Sum each row; keepdims=True keeps shape (m, 1) for broadcasting.
    x_sum = np.sum(x_exp, axis=1, keepdims=True)

    # Divide every row by its sum (numpy broadcasts x_sum across the columns).
    s = x_exp / x_sum
    return s


t_x = np.array([[9, 2, 5, 0, 0],
                [7, 5, 0, 0 ,0]])
print("softmax(x) = " + str(softmax(t_x)))
# softmax_test(softmax)
softmax(x) = [[9.80897665e-01 8.94462891e-04 1.79657674e-02 1.21052389e-04 1.21052389e-04] [8.78679856e-01 1.18916387e-01 8.01252314e-04 8.01252314e-04 8.01252314e-04]]
通过数据向量化能大大降低计算的时间复杂度,搞清楚dot/outer/elementwise product的区别。
import time

x1 = [9, 2, 5, 0, 0, 7, 5, 0, 0, 0, 9, 2, 5, 0, 0]
x2 = [9, 2, 2, 9, 0, 9, 2, 5, 0, 0, 9, 2, 5, 0, 0]

### CLASSIC DOT PRODUCT OF VECTORS IMPLEMENTATION ###
tic = time.process_time()
dot = 0
for left, right in zip(x1, x2):
    dot += left * right
toc = time.process_time()
print ("dot = " + str(dot) + "\n ----- Computation time = " + str(1000 * (toc - tic)) + "ms")

### CLASSIC OUTER PRODUCT IMPLEMENTATION ###
tic = time.process_time()
outer = np.zeros((len(x1), len(x2)))  # len(x1) x len(x2) matrix, initially all zeros
for i, left in enumerate(x1):
    for j, right in enumerate(x2):
        outer[i, j] = left * right
toc = time.process_time()
print ("outer = " + str(outer) + "\n ----- Computation time = " + str(1000 * (toc - tic)) + "ms")

### CLASSIC ELEMENTWISE IMPLEMENTATION ###
tic = time.process_time()
mul = np.zeros(len(x1))
for i, (left, right) in enumerate(zip(x1, x2)):
    mul[i] = left * right
toc = time.process_time()
print ("elementwise multiplication = " + str(mul) + "\n ----- Computation time = " + str(1000 * (toc - tic)) + "ms")

### CLASSIC GENERAL DOT PRODUCT IMPLEMENTATION ###
W = np.random.rand(3,len(x1))  # random 3 x len(x1) numpy array
tic = time.process_time()
gdot = np.zeros(W.shape[0])
for i, weights in enumerate(W):
    # Accumulate row i of W times x1, one term at a time.
    for weight, value in zip(weights, x1):
        gdot[i] += weight * value
toc = time.process_time()
print ("gdot = " + str(gdot) + "\n ----- Computation time = " + str(1000 * (toc - tic)) + "ms")
dot = 278 ----- Computation time = 0.0ms outer = [[81. 18. 18. 81. 0. 81. 18. 45. 0. 0. 81. 18. 45. 0. 0.] [18. 4. 4. 18. 0. 18. 4. 10. 0. 0. 18. 4. 10. 0. 0.] [45. 10. 10. 45. 0. 45. 10. 25. 0. 0. 45. 10. 25. 0. 0.] [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [63. 14. 14. 63. 0. 63. 14. 35. 0. 0. 63. 14. 35. 0. 0.] [45. 10. 10. 45. 0. 45. 10. 25. 0. 0. 45. 10. 25. 0. 0.] [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [81. 18. 18. 81. 0. 81. 18. 45. 0. 0. 81. 18. 45. 0. 0.] [18. 4. 4. 18. 0. 18. 4. 10. 0. 0. 18. 4. 10. 0. 0.] [45. 10. 10. 45. 0. 45. 10. 25. 0. 0. 45. 10. 25. 0. 0.] [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]] ----- Computation time = 0.0ms elementwise multiplication = [81. 4. 10. 0. 0. 63. 10. 0. 0. 0. 81. 4. 25. 0. 0.] ----- Computation time = 0.0ms gdot = [29.69127752 19.45650093 30.14580312] ----- Computation time = 0.0ms
x1 = [9, 2, 5, 0, 0, 7, 5, 0, 0, 0, 9, 2, 5, 0, 0]
x2 = [9, 2, 2, 9, 0, 9, 2, 5, 0, 0, 9, 2, 5, 0, 0]

### VECTORIZED DOT PRODUCT OF VECTORS ###
tic = time.process_time()
dot = np.dot(x1,x2)
toc = time.process_time()
elapsed_ms = 1000 * (toc - tic)
print ("dot = " + str(dot) + "\n ----- Computation time = " + str(elapsed_ms) + "ms")

### VECTORIZED OUTER PRODUCT ###
tic = time.process_time()
outer = np.outer(x1,x2)
toc = time.process_time()
elapsed_ms = 1000 * (toc - tic)
print ("outer = " + str(outer) + "\n ----- Computation time = " + str(elapsed_ms) + "ms")

### VECTORIZED ELEMENTWISE MULTIPLICATION ###
tic = time.process_time()
mul = np.multiply(x1,x2)
toc = time.process_time()
elapsed_ms = 1000 * (toc - tic)
print ("elementwise multiplication = " + str(mul) + "\n ----- Computation time = " + str(elapsed_ms) + "ms")

### VECTORIZED GENERAL DOT PRODUCT ###
# NOTE: the matrix-vector result is stored in `dot`, replacing the scalar dot
# product computed above, while the printed label is "gdot".
tic = time.process_time()
dot = np.dot(W,x1)
toc = time.process_time()
elapsed_ms = 1000 * (toc - tic)
print ("gdot = " + str(dot) + "\n ----- Computation time = " + str(elapsed_ms) + "ms")
dot = 278 ----- Computation time = 0.0ms outer = [[81 18 18 81 0 81 18 45 0 0 81 18 45 0 0] [18 4 4 18 0 18 4 10 0 0 18 4 10 0 0] [45 10 10 45 0 45 10 25 0 0 45 10 25 0 0] [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [63 14 14 63 0 63 14 35 0 0 63 14 35 0 0] [45 10 10 45 0 45 10 25 0 0 45 10 25 0 0] [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [81 18 18 81 0 81 18 45 0 0 81 18 45 0 0] [18 4 4 18 0 18 4 10 0 0 18 4 10 0 0] [45 10 10 45 0 45 10 25 0 0 45 10 25 0 0] [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]] ----- Computation time = 0.0ms elementwise multiplication = [81 4 10 0 0 63 10 0 0 0 81 4 25 0 0] ----- Computation time = 0.0ms gdot = [29.69127752 19.45650093 30.14580312] ----- Computation time = 0.0ms
`np.dot()` 执行的是矩阵-矩阵或者矩阵-向量的乘法运算,它和 `np.multiply()` 以及 `*` 运算符不同:后两者执行的是逐元素(element-wise)相乘。
1)L1和L2 loss function
# GRADED FUNCTION: L1
def L1(yhat, y):
    """Compute the L1 loss: the sum of the absolute differences |yhat - y|.

    Arguments:
    yhat -- vector of size m (predicted labels)
    y -- vector of size m (true labels)

    Returns:
    loss -- the value of the L1 loss function
    """
    loss = np.sum(np.abs(yhat - y))
    return loss


yhat = np.array([.9, 0.2, 0.1, .4, .9])
y = np.array([1, 0, 0, 1, 1])
print("L1 = " + str(L1(yhat,y)))
# prints: L1 = 1.1
# GRADED FUNCTION: L2
def L2(yhat, y):
    """Compute the L2 loss: the sum of the squared differences (yhat - y)^2.

    Arguments:
    yhat -- vector of size m (predicted labels)
    y -- vector of size m (true labels)

    Returns:
    loss -- the value of the L2 loss function
    """
    # The dot product of the difference vector with itself is the sum of its
    # squared entries; the difference is computed only once.
    diff = yhat - y
    loss = np.dot(diff, diff.T)
    return loss


yhat = np.array([.9, 0.2, 0.1, .4, .9])
y = np.array([1, 0, 0, 1, 1])
print("L2 = " + str(L2(yhat,y)))
# prints: L2 = 0.43
datawhale course