先说结论:资源使用不合理导致机器性能浪费
起因:线上某应用告警机器负载及资源占用异常,登录机器查看进程
![](https://ucc.alicdn.com/xtm66pk7b23p6/developer-article857412/20241025/d0a1e2ba15eb4ad2abb0b158473d7793.png?x-oss-process=image/resize,w_1400/format,webp)
vmstat发现idle比较低
![](https://ucc.alicdn.com/xtm66pk7b23p6/developer-article857412/20241025/29abb4d1b84946efa42f01175550d57c.png?x-oss-process=image/resize,w_1400/format,webp)
![](https://ucc.alicdn.com/xtm66pk7b23p6/developer-article857412/20241025/6a0c43e5c7e84b8eb16bb67cd84426f2.png?x-oss-process=image/resize,w_1400/format,webp)
![](https://ucc.alicdn.com/xtm66pk7b23p6/developer-article857412/20241025/fecee6d4d9f04f74afef8c309b0acc6d.png?x-oss-process=image/resize,w_1400/format,webp)
通过代码分析可以看出,send函数本质上是通过第三方库fasthttp来发送网络API请求
// ReceiveMessage issues GET requests against this queue's "messages" resource
// and reports the outcome on the supplied channels.
//
// When no waitseconds are passed, exactly one request is made without a
// waitseconds query parameter; its result goes to respChan on success or
// errChan on failure.
//
// When waitseconds are passed, each positive value is tried in order with
// "?waitseconds=<value>" appended to the resource. Every failed attempt pushes
// its error to errChan and moves on to the next value; the first successful
// attempt pushes the response to respChan and stops. Non-positive values are
// skipped, so if every value is non-positive nothing is sent on either channel.
//
// Channel sends are plain blocking sends; the caller is responsible for
// draining respChan and errChan.
func (p *MNSQueue) ReceiveMessage(respChan chan MessageReceiveResponse, errChan chan error, waitseconds ...int64) {
	// No wait times supplied: a single attempt on the bare resource.
	if waitseconds == nil {
		target := fmt.Sprintf("queues/%s/%s", p.name, "messages")
		p.qpsMonitor.checkQPS()

		var received MessageReceiveResponse
		if _, sendErr := send(p.client, p.decoder, GET, nil, nil, target, &received); sendErr != nil {
			errChan <- sendErr
			return
		}
		respChan <- received
		return
	}

	for _, wait := range waitseconds {
		if wait <= 0 {
			// Only positive wait times produce a request.
			continue
		}

		target := fmt.Sprintf("queues/%s/%s?waitseconds=%d", p.name, "messages", wait)
		p.qpsMonitor.checkQPS()

		var received MessageReceiveResponse
		_, sendErr := send(p.client, p.decoder, GET, nil, nil, target, &received)
		if sendErr == nil {
			// Stop at the first success; more messages may already be queued.
			respChan <- received
			return
		}
		// Report the failure and fall through to the next wait time.
		errChan <- sendErr
	}
}
// send performs one request through client and interprets the response.
//
// It returns the HTTP status code of the response (0 when the request itself
// failed or the client returned a nil response) together with an error:
//   - the error from client.Send, if the request failed;
//   - for any status other than 200, 201 or 204, the error decoded from the
//     response body by decoder.DecodeError, or
//     ERR_UNMARSHAL_ERROR_RESPONSE_FAILED if that body could not be decoded;
//   - ERR_UNMARSHAL_RESPONSE_FAILED if a success body could not be decoded
//     into v;
//   - nil otherwise.
//
// When v is nil the success body is not decoded at all.
func send(client MNSClient, decoder MNSDecoder, method Method, headers map[string]string, message interface{}, resource string, v interface{}) (statusCode int, err error) {
	resp, err := client.Send(method, headers, message, resource)
	if err != nil {
		return statusCode, err
	}
	if resp == nil {
		// Nothing to inspect; report the zero status code and no error.
		return statusCode, nil
	}

	statusCode = resp.Header.StatusCode()
	switch statusCode {
	case fasthttp.StatusCreated, fasthttp.StatusOK, fasthttp.StatusNoContent:
		if v == nil {
			return statusCode, nil
		}
		if decodeErr := decoder.Decode(bytes.NewReader(resp.Body()), v); decodeErr != nil {
			return statusCode, ERR_UNMARSHAL_RESPONSE_FAILED.New(errors.Params{"err": decodeErr})
		}
		return statusCode, nil

	default:
		// Non-success status: the body carries the service's error description.
		raw := resp.Body()
		serviceErr, parseErr := decoder.DecodeError(raw, resource)
		if parseErr != nil {
			// Keep the raw body in the error so an undecodable reply is still visible.
			return statusCode, ERR_UNMARSHAL_ERROR_RESPONSE_FAILED.New(errors.Params{"err": parseErr, "resp": string(raw)})
		}
		return statusCode, serviceErr
	}
}
为什么会有问题?去业务代码里反推调用层
![](https://ucc.alicdn.com/xtm66pk7b23p6/developer-article857412/20241025/1c77456e60fe47c98dbe1c64c1d28799.png?x-oss-process=image/resize,w_1400/format,webp)
![](https://ucc.alicdn.com/xtm66pk7b23p6/developer-article857412/20241025/1130872204e2413fb76db465fb136cfd.png?x-oss-process=image/resize,w_1400/format,webp)
![](https://ucc.alicdn.com/xtm66pk7b23p6/developer-article857412/20241025/0bcebb3267584417be3bd4dd2fdce78c.png?x-oss-process=image/resize,w_1400/format,webp)
重点来了,MNS的消息机制本质上是由客户端主动去服务器拉取消息
- 如果服务器没有消息,会造成大量的资源浪费
- 分析该业务的线上实际情况,有两个解决思路
![](https://ucc.alicdn.com/xtm66pk7b23p6/developer-article857412/20241025/fc1ed38375b44ae996fea87e73e5870b.png?x-oss-process=image/resize,w_1400/format,webp)
![](https://ucc.alicdn.com/xtm66pk7b23p6/developer-article857412/20241025/5cb167bfe86b49d5ac988d0760e45b08.png?x-oss-process=image/resize,w_1400/format,webp)
![](https://ucc.alicdn.com/xtm66pk7b23p6/developer-article857412/20241025/19391ea7482844caaa33d750517affb7.png?x-oss-process=image/resize,w_1400/format,webp)
为什么上云之前没有出现这个问题:
最后再来总结一下:
- 通过Go的pprof能够帮助我们分析绝大部分线上的异常情况
- 出了问题,如果初步判定是三方库的原因,先不着急下结论,虽然它设计上可能有问题,但也可能是没用对
- 解决问题的方法很多,要结合实际业务情况,作出当前阶段最适合的做法,才是技术同学该追求的东西