目录
Audio2Face简介
在元宇宙的热潮下,为了让AI数字人渗透到更多的领域中,FACEGOOD已经将语音驱动口型的算法技术开源,开源地址:
https://github.com/FACEGOOD/FACEGOOD-Audio2Face
该技术可以实时将音频数据转换为驱动数字人面部BlendShape的权重数据,不同于ARKit中的52个BlendShape,它的数量多达116个,我们可以通过对应关系得到相应的数值,对应关系如下:
ARKit | Voice2Face | ||||
eyeBlinkLeft | eye_blink2_l | ||||
eyeLookDownLeft | eye_lookDown2_l | ||||
eyeLookInLeft | eye_lookRight_l | ||||
eyeLookOutLeft | eye_lookLeft_l | ||||
eyeLookUpLeft | eye_lookUp_l | ||||
eyeSquintLeft | eye_shutTight_l | ||||
eyeWideLeft | max(eye_downLidRaise_l,eye_upperLidRaise_l) | ||||
eyeBlinkRight | eye_blink2_r | ||||
eyeLookDownRight | eye_lookDown2_r | ||||
eyeLookInRight | eye_lookRight_r | ||||
eyeLookOutRight | eye_lookLeft_r | ||||
eyeLookUpRight | eye_lookUp_r | ||||
eyeSquintRight | eye_shutTight_r | ||||
eyeWideRight | max(eye_downLidRaise_r,eye_upperLidRaise_r) | ||||
jawForward | jaw_thrust_c | ||||
jawLeft | jaw_sideways_l | ||||
jawRight | jaw_sideways_r | ||||
jawOpen | mouth_stretch_c | ||||
mouthClose | mouth_chew_c | ||||
mouthFunnel | max(mouth_funnel_dl,mouth_funnel_dr,mouth_funnel_ul,mouth_funnel_ur) | ||||
mouthPucker | max(mouth_pucker_l,mouth_pucker_r) | ||||
mouthLeft | mouth_sideways_l | ||||
mouthRight | mouth_sideways_r | ||||
mouthSmileLeft | mouth_lipCornerPull_l | ||||
mouthSmileRight | mouth_lipCornerPull_r | ||||
mouthFrownLeft | max(mouth_lipCornerDepress_l,mouth_lipCornerDepressFix_l) | ||||
mouthFrownRight | max(mouth_lipCornerDepress_r,mouth_lipCornerDepressFix_r) | ||||
mouthDimpleLeft | mouth_dimple_l | ||||
mouthDimpleRight | mouth_dimple_r | ||||
mouthStretchLeft | mouth_lipStretch_l | ||||
mouthStretchRight | mouth_lipStretch_r | ||||
mouthRollLower | max(mouth_suck_dl,mouth_suck_dr) | ||||
mouthRollUpper | max(mouth_suck_ul,mouth_suck_ur) | ||||
mouthShrugLower | mouth_chinRaise_d | ||||
mouthShrugUpper | mouth_chinRaise_u | ||||
mouthPressLeft | mouth_press_l | ||||
mouthPressRight | mouth_press_r | ||||
mouthLowerDownLeft | mouth_lowerLipDepress_l | ||||
mouthLowerDownRight | mouth_lowerLipDepress_r | ||||
mouthUpperUpLeft | mouth_upperLipRaise_l | ||||
mouthUpperUpRight | mouth_upperLipRaise_r | ||||
browDownLeft | brow_lower_l | ||||
browDownRight | brow_lower_r | ||||
browInnerUp | brow_raise_c | ||||
browOuterUpLeft | brow_raise_l | ||||
browOuterUpRight | brow_raise_r | ||||
cheekPuff | max(cheek_puff_l,cheek_puff_r) | ||||
cheekSquintLeft | cheek_UP | ||||
cheekSquintRight | cheek_UP | ||||
noseSneerLeft | nose_out_l | ||||
noseSneerRight | nose_out_r | ||||
tongueOut |
生成的数据结果如下图所示,可见是116个取值范围为-1~1的小数:
这116个数值依次对应下面116个BlendShape名称:
brow_lower_l tongue_Scale__X tongue_Scale_Y tongue_Scale__Y tongue_Scale_Z tongue_Scale__Z nose_out_l nose_out_r tongue_u tongue_u_u brow_raise_d cheek_suck_r mouth_stretch_u tongue_u_d tooth_d_d tongue_d tooth_r tooth_d_u cheek_UP eye_blink1_l eye_blink1_r eye_blink2_l eye_blink2_r eye_lidTight_l eye_lidTight_r eye_shutTight_l eye_shutTight_r brow_lower_r eye_upperLidRaise_l eye_upperLidRaise_r eye_downLidRaise_l eye_downLidRaise_r jaw_sideways_l jaw_sideways_r jaw_thrust_c mouth_chew_c mouth_chinRaise_d mouth_chinRaise_u brow_raise_c mouth_dimple_l mouth_dimple_r mouth_funnel_dl mouth_funnel_dr mouth_funnel_ul mouth_funnel_ur mouth_lipCornerDepressFix_l mouth_lipCornerDepressFix_r mouth_lipCornerDepress_l mouth_lipCornerDepress_r brow_raise_l mouth_lipCornerPullOpen_l mouth_lipCornerPullOpen_r mouth_lipCornerPull_l mouth_lipCornerPull_r mouth_lipStretchOpen_l mouth_lipStretchOpen_r mouth_lipStretch_l mouth_lipStretch_r mouth_lowerLipDepress_l mouth_lowerLipDepress_r brow_raise_r mouth_lowerLipProtrude_c mouth_oh_c mouth_oo_c mouth_pressFix_c mouth_press_l mouth_press_r mouth_pucker_l mouth_pucker_r mouth_screamFix_c mouth_sideways_l cheek_puff_l mouth_sideways_r mouth_stretch_c mouth_suck_dl mouth_suck_dr mouth_suck_ul mouth_suck_ur mouth_upperLipRaise_l mouth_upperLipRaise_r nose_wrinkle_l nose_wrinkle_r cheek_puff_r tooth_l eye_lookDown1_l eye_lookDown2_l eye_lookLeft_l eye_lookRight_l eye_lookUp_l eye_lookDown1_r eye_lookDown2_r eye_lookLeft_r eye_lookRight_r cheek_raise_l eye_lookUp_r tongue_Rot_1X tongue_Rot__1X tongue_Rot_2X tongue_Rot__2X tongue_Rot_3X tongue_Rot__3X tongue_Rot_1Y tongue_Rot__1Y tongue_Rot_2Y cheek_raise_r tongue_Rot__2Y tongue_Rot_3Y tongue_Rot__3Y tongue_Rot_1Z tongue_Rot__1Z tongue_Rot_2Z tongue_Rot__2Z tongue_Rot_3Z tongue_Rot__3Z tongue_Scale_X cheek_suck_l
在Unity中应用
可以通过构建Python服务来实现:Unity客户端开启麦克风录制音频,将音频数据发送给Python服务端,服务端将其转换为驱动BlendShape的权重数据后,返回给Unity客户端进行驱动。需要注意的是,Unity中BlendShape的权重范围并不是[-1,1],因此需要进行映射。
例如:
// Maps a weight from the [-1, 1] range onto [-100, 100] by scaling it by 100.
private float Remap(float v) => v * 100f;
下面是一段测试音频产生的bs权重数据文件,每一行包含116个权重数值,我们拿来进行测试,将其放到StreamingAssets文件夹下。
测试模型:
测试代码:
using System.IO;
using System.Collections;
using System.Collections.Generic;
using System.Globalization;
using UnityEngine;

/// <summary>
/// Plays back a pre-generated blend-shape weight file on the
/// <see cref="SkinnedMeshRenderer"/> attached to this GameObject.
/// Each line of "weight.txt" (in StreamingAssets) is one frame of
/// whitespace-separated weights; the frame is remapped and written to
/// blend shapes 0..50 of the renderer.
/// </summary>
public class TEST : MonoBehaviour
{
    // The highest source index read in ApplyFrame is 94, so a frame needs
    // at least 95 values to be applied without going out of range.
    private const int RequiredWeightCount = 95;

    // Seconds to wait between two consecutive frames during playback.
    private const float FrameInterval = .07f;

    private static readonly char[] Separators = { ' ', '\t' };

    private Coroutine coroutine;
    private SkinnedMeshRenderer smr;
    private readonly List<List<float>> valueList = new List<List<float>>();

    /// <summary>
    /// Loads "weight.txt" from StreamingAssets and parses one line per
    /// rendered frame, appending every usable frame to the playback list.
    /// </summary>
    private IEnumerator Start()
    {
        smr = GetComponent<SkinnedMeshRenderer>();
        if (smr == null)
        {
            Debug.LogError("TEST: no SkinnedMeshRenderer found on this GameObject.");
        }

        string path = Path.Combine(Application.streamingAssetsPath, "weight.txt");
        if (!File.Exists(path))
        {
            // NOTE(review): on platforms where StreamingAssets is not a plain
            // directory, System.IO cannot read it - confirm the target platform.
            Debug.LogError("TEST: weight file not found: " + path);
            yield break;
        }

        // Read the whole file up front so that no file handle stays open
        // across the yields below.
        string[] lines = File.ReadAllLines(path);
        for (int i = 0; i < lines.Length; i++)
        {
            List<float> frame = ParseLine(lines[i]);
            if (frame.Count >= RequiredWeightCount)
            {
                valueList.Add(frame);
            }
            else if (frame.Count > 0)
            {
                Debug.LogWarning("TEST: line " + (i + 1) + " has only " + frame.Count
                    + " values (need at least " + RequiredWeightCount + "); skipped.");
            }

            yield return null;
        }
    }

    /// <summary>
    /// Splits one line on spaces/tabs and parses every token as a float
    /// using the invariant culture ('.' as decimal separator).
    /// </summary>
    /// <param name="line">One raw line of the weight file.</param>
    /// <returns>The parsed values; empty for a blank line.</returns>
    private static List<float> ParseLine(string line)
    {
        string[] tokens = line.Split(Separators, System.StringSplitOptions.RemoveEmptyEntries);
        List<float> values = new List<float>(tokens.Length);
        for (int i = 0; i < tokens.Length; i++)
        {
            // An unparsable token yields 0 so the following values keep their index.
            float.TryParse(tokens[i], NumberStyles.Float, CultureInfo.InvariantCulture, out float value);
            values.Add(value);
        }

        return values;
    }

    /// <summary>
    /// Applies the loaded frames one after another, waiting
    /// <see cref="FrameInterval"/> seconds between frames, then clears the
    /// running-coroutine handle.
    /// </summary>
    private IEnumerator ExecuteCoroutine()
    {
        for (int i = 0; i < valueList.Count; i++)
        {
            ApplyFrame(valueList[i]);
            yield return new WaitForSeconds(FrameInterval);
        }

        coroutine = null;
    }

    /// <summary>
    /// Writes one frame to blend shapes 0..50. The trailing comment on each
    /// line names the source weight(s) read from the frame.
    /// </summary>
    /// <param name="list">One frame with at least 95 weights.</param>
    private void ApplyFrame(List<float> list)
    {
        smr.SetBlendShapeWeight(0, Remap(list[49])); // brow_raise_l
        smr.SetBlendShapeWeight(1, Remap(list[60])); // brow_raise_r
        smr.SetBlendShapeWeight(2, Remap(list[25])); // eye_shutTight_l
        smr.SetBlendShapeWeight(3, Remap(list[26])); // eye_shutTight_r
        smr.SetBlendShapeWeight(4, Remap(list[87])); // eye_lookRight_l
        smr.SetBlendShapeWeight(5, Remap(list[86])); // eye_lookLeft_l
        smr.SetBlendShapeWeight(6, Remap(list[92])); // eye_lookRight_r
        smr.SetBlendShapeWeight(7, Remap(list[91])); // eye_lookLeft_r
        smr.SetBlendShapeWeight(8, Remap(list[88])); // eye_lookUp_l
        smr.SetBlendShapeWeight(9, Remap(list[94])); // eye_lookUp_r
        smr.SetBlendShapeWeight(10, Remap(list[85])); // eye_lookDown2_l
        smr.SetBlendShapeWeight(11, Remap(list[90])); // eye_lookDown2_r
        smr.SetBlendShapeWeight(12, Mathf.Max(Remap(list[71]), Remap(list[82]))); // cheek_puff_l cheek_puff_r
        smr.SetBlendShapeWeight(13, Remap(list[18])); // cheek_UP
        smr.SetBlendShapeWeight(14, Remap(list[18])); // cheek_UP
        smr.SetBlendShapeWeight(15, Remap(list[6])); // nose_out_l
        smr.SetBlendShapeWeight(16, Remap(list[7])); // nose_out_r
        smr.SetBlendShapeWeight(17, Remap(list[70])); // mouth_sideways_l
        smr.SetBlendShapeWeight(18, Remap(list[72])); // mouth_sideways_r
        smr.SetBlendShapeWeight(19, Mathf.Max(Remap(list[67]), Remap(list[68]))); // mouth_pucker_l mouth_pucker_r
        smr.SetBlendShapeWeight(20, Mathf.Max(Remap(list[41]), Remap(list[42]), Remap(list[43]), Remap(list[44]))); // mouth_funnel_dl dr ul ur
        smr.SetBlendShapeWeight(21, Remap(list[52])); // mouth_lipCornerPull_l
        smr.SetBlendShapeWeight(22, Remap(list[53])); // mouth_lipCornerPull_r
        smr.SetBlendShapeWeight(23, Mathf.Max(Remap(list[47]), Remap(list[45]))); // mouth_lipCornerDepress_l mouth_lipCornerDepressFix_l
        smr.SetBlendShapeWeight(24, Mathf.Max(Remap(list[48]), Remap(list[46]))); // mouth_lipCornerDepress_r mouth_lipCornerDepressFix_r
        smr.SetBlendShapeWeight(25, Remap(list[39])); // mouth_dimple_l
        smr.SetBlendShapeWeight(26, Remap(list[40])); // mouth_dimple_r
        smr.SetBlendShapeWeight(27, Remap(list[65])); // mouth_press_l
        smr.SetBlendShapeWeight(28, Remap(list[66])); // mouth_press_r
        smr.SetBlendShapeWeight(29, Remap(list[36])); // mouth_chinRaise_d
        smr.SetBlendShapeWeight(30, Remap(list[37])); // mouth_chinRaise_u
        smr.SetBlendShapeWeight(31, Remap(list[56])); // mouth_lipStretch_l
        smr.SetBlendShapeWeight(32, Remap(list[57])); // mouth_lipStretch_r
        smr.SetBlendShapeWeight(33, Remap(list[78])); // mouth_upperLipRaise_l
        smr.SetBlendShapeWeight(34, Remap(list[79])); // mouth_upperLipRaise_r
        smr.SetBlendShapeWeight(35, Remap(list[58])); // mouth_lowerLipDepress_l
        smr.SetBlendShapeWeight(36, Remap(list[59])); // mouth_lowerLipDepress_r
        smr.SetBlendShapeWeight(37, Mathf.Max(Remap(list[76]), Remap(list[77]))); // mouth_suck_ul mouth_suck_ur
        smr.SetBlendShapeWeight(38, Mathf.Max(Remap(list[74]), Remap(list[75]))); // mouth_suck_dl mouth_suck_dr
        smr.SetBlendShapeWeight(39, Remap(list[35])); // mouth_chew_c
        smr.SetBlendShapeWeight(40, Remap(list[34])); // jaw_thrust_c
        smr.SetBlendShapeWeight(41, Remap(list[73])); // mouth_stretch_c
        smr.SetBlendShapeWeight(42, Remap(list[32])); // jaw_sideways_l
        smr.SetBlendShapeWeight(43, Remap(list[33])); // jaw_sideways_r
        smr.SetBlendShapeWeight(44, Remap(list[38])); // brow_raise_c
        smr.SetBlendShapeWeight(45, Remap(list[22])); // eye_blink2_r
        smr.SetBlendShapeWeight(46, Remap(list[21])); // eye_blink2_l
        smr.SetBlendShapeWeight(47, Remap(list[0])); // brow_lower_l
        smr.SetBlendShapeWeight(48, Remap(list[27])); // brow_lower_r
        smr.SetBlendShapeWeight(49, Mathf.Max(Remap(list[31]), Remap(list[29]))); // eye_downLidRaise_r eye_upperLidRaise_r
        smr.SetBlendShapeWeight(50, Mathf.Max(Remap(list[30]), Remap(list[28]))); // eye_downLidRaise_l eye_upperLidRaise_l
    }

    /// <summary>Scales a weight by 100, mapping [-1, 1] onto [-100, 100].</summary>
    private float Remap(float v)
    {
        return v * 100f;
    }

    /// <summary>
    /// Draws the "Begin" and "Stop" buttons. "Begin" is enabled only while
    /// nothing is playing, a renderer exists and at least one frame is
    /// loaded; "Stop" is enabled only while playback is running.
    /// </summary>
    private void OnGUI()
    {
        // With zero frames the coroutine would finish inside StartCoroutine,
        // before the handle is assigned, leaving "Begin" disabled afterwards.
        GUI.enabled = coroutine == null && smr != null && valueList.Count > 0;
        if (GUILayout.Button("Begin", GUILayout.Width(200f), GUILayout.Height(50f)))
        {
            coroutine = StartCoroutine(ExecuteCoroutine());
        }

        GUI.enabled = coroutine != null;
        if (GUILayout.Button("Stop", GUILayout.Width(200f), GUILayout.Height(50f)))
        {
            if (coroutine != null)
            {
                StopCoroutine(coroutine);
            }

            coroutine = null;
        }
    }
}