Unity万人同屏动态避障 GPU动画 Entities Graphics高性能合批渲染插件的使用_哔哩哔哩_bilibili

由于DOTS的限制太多,对于需要DLC或热更新的项目来说,DOTS就爱莫能助了。能不能不用Entities,只用Entities Graphics呢?

当然是可以的,Entities Graphics背后使用的接口就是BatchRendererGroup。

自定义BatchRendererGroup合批渲染,可以参考Unity官方文档:Initializing a BatchRendererGroup object - Unity 手册

1. 创建一个BatchRendererGroup对象和GraphicsBuffer:

m_BRG = new BatchRendererGroup(this.OnPerformCulling, IntPtr.Zero);

m_InstanceData = new GraphicsBuffer(GraphicsBuffer.Target.Raw,             BufferCountForInstances(kBytesPerInstance, kNumInstances, kExtraBytes),             sizeof(int));

2. 注册需要渲染的Mesh和对应的Material:

m_MeshID = m_BRG.RegisterMesh(mesh); m_MaterialID = m_BRG.RegisterMaterial(material); 

3. 为所有需要渲染的目标创建矩阵数组并传入Graphics Buffer里:

 m_InstanceData.SetData(objectToWorld, 0, (int)(byteAddressObjectToWorld / kSizeOfPackedMatrix), objectToWorld.Length);         m_InstanceData.SetData(worldToObject, 0, (int)(byteAddressWorldToObject / kSizeOfPackedMatrix), worldToObject.Length);

4. 把GraphicsBuffer添加到BatchRendererGroup进行批次渲染:

m_BatchID = m_BRG.AddBatch(metadata, m_InstanceData.bufferHandle);

创建BatchRendererGroup时需要指定一个OnPerformCulling回调,它会在相机执行Culling时被自动调用,这里可以直接使用Unity手册里的示例代码:Creating draw commands - Unity 手册

这里我主要测试的是使用BatchRendererGroup合批渲染的性能:使用Job System多线程并行修改矩阵数组中的位置和旋转,以控制小人移动起来。

控制小人移动的Job System代码如下:

/// <summary>
/// Parallel job that walks every instance towards its current target point and
/// writes the resulting transform into the packed object-to-world and
/// world-to-object arrays that are uploaded to the GPU instance buffer.
/// When an instance gets close to its target, a new random target on the XZ
/// plane is chosen inside <see cref="randomPostionRange"/>.
/// </summary>
[BurstCompile]
partial struct RandomMoveJob : IJobParallelFor
{
    // Per-frame seed source. It is never advanced inside Execute: a job field is
    // not a safe place for mutable state in a parallel job, and advancing it
    // would make every worker produce the same sequence.
    [ReadOnly]
    public Unity.Mathematics.Random random;

    // (minX, maxX, minZ, maxZ) bounds for newly picked target points.
    [ReadOnly]
    public float4 randomPostionRange;

    // Distance moved this frame (the caller passes delta time scaled by speed).
    [ReadOnly]
    public float m_DeltaTime;

    // Current transform of every instance; read and rewritten in place.
    public NativeArray<Matrix4x4> matrices;

    // Current target point of every instance; rewritten when a target is reached.
    public NativeArray<float3> targetMovePoints;

    // Output: packed object-to-world matrix per instance.
    public NativeArray<PackedMatrix> obj2WorldArr;

    // Output: packed world-to-object (inverse) matrix per instance.
    public NativeArray<PackedMatrix> world2ObjArr;

    public void Execute(int index)
    {
        float3 curPos = matrices[index].GetPosition();
        float3 target = targetMovePoints[index];

        // Close enough to the target (squared distance): pick a new one.
        if (math.lengthsq(target - curPos) < 0.4f)
        {
            // Derive an independent generator from the frame seed and the element
            // index so that instances do not share random values. "| 1u" keeps
            // the seed non-zero, which Unity.Mathematics.Random requires.
            var rng = new Unity.Mathematics.Random(
                math.hash(new uint2(random.state, (uint)index)) | 1u);

            target.x = rng.NextFloat(randomPostionRange.x, randomPostionRange.y);
            target.z = rng.NextFloat(randomPostionRange.z, randomPostionRange.w);
            targetMovePoints[index] = target;
        }

        // Fall back to "forward" when the instance sits exactly on its target.
        float3 dir = math.normalizesafe(target - curPos, Vector3.forward);
        curPos += dir * m_DeltaTime;

        Matrix4x4 mat = Matrix4x4.TRS(curPos, Quaternion.LookRotation(dir), Vector3.one);
        matrices[index] = mat;

        // Every field of the packed matrices is overwritten, so there is no need
        // to read the previous element first.
        obj2WorldArr[index] = new PackedMatrix(mat);
        world2ObjArr[index] = new PackedMatrix(mat.inverse);
    }
}

然后在主线程的Update中每帧调度并执行Job,再把Job的运算结果传入GraphicsBuffer更新即可:

// Runs the movement job for all instances once per frame and uploads the
// resulting packed matrices to the GPU instance buffer.
private void Update()
{
    int count = matrices.Length;

    var tempMatrices = new NativeArray<Matrix4x4>(matrices, Allocator.TempJob);
    var tempTargetPoints = new NativeArray<float3>(m_TargetPoints, Allocator.TempJob);
    // The job writes every element of both output arrays, so clearing them first is wasted work.
    var tempobjectToWorldArr = new NativeArray<PackedMatrix>(count, Allocator.TempJob, NativeArrayOptions.UninitializedMemory);
    var tempWorldToObjectArr = new NativeArray<PackedMatrix>(count, Allocator.TempJob, NativeArrayOptions.UninitializedMemory);

    try
    {
        // Unity.Mathematics.Random rejects a zero seed, so clamp the frame counter to at least 1.
        random = new Unity.Mathematics.Random(math.max(1u, (uint)Time.frameCount));

        var moveJob = new RandomMoveJob
        {
            matrices = tempMatrices,
            targetMovePoints = tempTargetPoints,
            random = random,
            m_DeltaTime = Time.deltaTime * 4f,
            randomPostionRange = m_randomRange,
            obj2WorldArr = tempobjectToWorldArr,
            world2ObjArr = tempWorldToObjectArr
        };

        // The results are needed immediately for the upload below, so wait for the job.
        moveJob.Schedule(count, 64).Complete();

        // Copy back into the existing managed arrays instead of allocating new
        // ones with ToArray() every frame.
        tempMatrices.CopyTo(matrices);
        tempTargetPoints.CopyTo(m_TargetPoints);

        // The destination index is expressed in PackedMatrix-sized elements.
        m_InstanceData.SetData(tempobjectToWorldArr, 0, (int)(byteAddressObjectToWorld / kSizeOfPackedMatrix), count);
        m_InstanceData.SetData(tempWorldToObjectArr, 0, (int)(byteAddressWorldToObject / kSizeOfPackedMatrix), count);
    }
    finally
    {
        // TempJob allocations must be released even if something above throws.
        tempMatrices.Dispose();
        tempTargetPoints.Dispose();
        tempobjectToWorldArr.Dispose();
        tempWorldToObjectArr.Dispose();
    }
}

Okay,跑起来看看:

 瞬间惊呆了,你没看错,使用BatchRendererGroup创建一万个小人居然能跑600多帧!!!

难道万人同屏要成了?继续加大药量,创建10万个带有移动逻辑的小人:

 10万个奔跑的3D人物,仍然有100帧以上,有23个线程并行计算移动。

看看性能分析:

 当数量级庞大时,即使Job System + Burst编译再怎么开挂,主线程也会拖后腿的。

100万的压迫感,虽然已经成PPT了:

 难道万人同屏行业难题的门槛就这么被Unity Dots打下来了??

非也,上移动端测试:

同样1万个小人,PC端能达到惊人的600帧,而Android最强骁龙8 Gen2只有10多帧,而且工作线程数才5个; 当数量3000人时,手机端帧数46帧左右,相比传统方式没有任何提升!没错,没有任何提升。

Profiler中可以看到,瓶颈依然是GPU。而Entities Graphics内部也是通过BatchRendererGroup接口实现的,由此可以推断,被吹爆的Entities在移动端应该也是"水土不服":

 结论:

目前为止,我认为使用自定义BatchRendererGroup合批是PC端万人同屏的最优解了。

但是手机端性能瓶颈任重道远。手机端放弃!

最后附上本文BatchRendererGroup测试代码:

using System;

using TMPro;

using Unity.Burst;

using Unity.Collections;

using Unity.Collections.LowLevel.Unsafe;

using Unity.Jobs;

using Unity.Jobs.LowLevel.Unsafe;

using Unity.Mathematics;

using UnityEngine;

using UnityEngine.Rendering;

// PackedMatrix is a convenience type that converts matrices into the format
// that Unity-provided SRP shaders expect: the first three rows of a 4x4 matrix,
// stored column by column as twelve consecutive floats.
struct PackedMatrix
{
    // Column 0 (rows 0..2)
    public float c0x, c0y, c0z;
    // Column 1 (rows 0..2)
    public float c1x, c1y, c1z;
    // Column 2 (rows 0..2)
    public float c2x, c2y, c2z;
    // Column 3 (rows 0..2)
    public float c3x, c3y, c3z;

    // Packs the upper 3x4 part of the given matrix; the fourth row is dropped.
    public PackedMatrix(Matrix4x4 m)
    {
        c0x = m.m00; c0y = m.m10; c0z = m.m20;
        c1x = m.m01; c1y = m.m11; c1z = m.m21;
        c2x = m.m02; c2y = m.m12; c2z = m.m22;
        c3x = m.m03; c3y = m.m13; c3z = m.m23;
    }

    // Overwrites all twelve values with the packed form of the given matrix.
    public void SetData(Matrix4x4 m)
    {
        this = new PackedMatrix(m);
    }
}

/// <summary>
/// Renders <c>kNumInstances</c> copies of one mesh/material through a single
/// <see cref="BatchRendererGroup"/> batch and moves them every frame with a
/// Burst-compiled parallel job (<c>RandomMoveJob</c>).
/// </summary>
public class SimpleBRGExample : MonoBehaviour
{
    public Mesh mesh;
    public Material material;
    public TextMeshProUGUI text;
    public TextMeshProUGUI workerCountText;

    private BatchRendererGroup m_BRG;
    private GraphicsBuffer m_InstanceData;
    private BatchID m_BatchID;
    private BatchMeshID m_MeshID;
    private BatchMaterialID m_MaterialID;

    // Some helper constants to make calculations more convenient.
    private const int kSizeOfMatrix = sizeof(float) * 4 * 4;
    private const int kSizeOfPackedMatrix = sizeof(float) * 4 * 3;
    private const int kSizeOfFloat4 = sizeof(float) * 4;
    private const int kBytesPerInstance = (kSizeOfPackedMatrix * 2) + kSizeOfFloat4;
    private const int kExtraBytes = kSizeOfMatrix * 2;

    // Serialized field names are kept as-is so values stored in existing scenes survive.
    [SerializeField] private int kNumInstances = 20000;
    [SerializeField] private int m_RowCount = 200;

    // CPU-side copies of the per-instance data.
    private Matrix4x4[] matrices;
    private PackedMatrix[] objectToWorld;
    private PackedMatrix[] worldToObject;
    private Vector4[] colors;

    // Current movement target of every instance.
    float3[] m_TargetPoints;
    Unity.Mathematics.Random random;
    // (minX, maxX, minZ, maxZ) bounds used when picking target points.
    Vector4 m_randomRange;

    // Byte offsets of each instanced property inside m_InstanceData.
    private uint byteAddressObjectToWorld;
    private uint byteAddressWorldToObject;
    private uint byteAddressColor;

    // Last worker count written to the label, so the string is only rebuilt on change.
    private int m_LastWorkerCount = -1;

    private void Start()
    {
        m_BRG = new BatchRendererGroup(this.OnPerformCulling, IntPtr.Zero);
        m_MeshID = m_BRG.RegisterMesh(mesh);
        m_MaterialID = m_BRG.RegisterMaterial(material);

        AllocateInstanceDateBuffer();
        PopulateInstanceDataBuffer();

        text.text = kNumInstances.ToString();

        random = new Unity.Mathematics.Random(1);
        m_TargetPoints = new float3[kNumInstances];

        // Half extents of the grid the instances are initially laid out on.
        var offset = new Vector3(m_RowCount, 0, Mathf.CeilToInt(kNumInstances / (float)m_RowCount)) * 0.5f;
        m_randomRange = new float4(-offset.x, offset.x, -offset.z, offset.z);

        for (int i = 0; i < m_TargetPoints.Length; i++)
        {
            var newTargetPos = new float3();
            newTargetPos.x = random.NextFloat(m_randomRange.x, m_randomRange.y);
            newTargetPos.z = random.NextFloat(m_randomRange.z, m_randomRange.w);
            m_TargetPoints[i] = newTargetPos;
        }
    }

    // Runs the movement job for all instances once per frame and uploads the
    // resulting packed matrices to the GPU instance buffer.
    private void Update()
    {
        // Nothing to update once the GPU resources were released in OnDisable.
        if (m_InstanceData == null)
        {
            return;
        }

        int count = matrices.Length;

        var tempMatrices = new NativeArray<Matrix4x4>(matrices, Allocator.TempJob);
        var tempTargetPoints = new NativeArray<float3>(m_TargetPoints, Allocator.TempJob);
        // The job writes every element of both output arrays, so clearing them first is wasted work.
        var tempobjectToWorldArr = new NativeArray<PackedMatrix>(count, Allocator.TempJob, NativeArrayOptions.UninitializedMemory);
        var tempWorldToObjectArr = new NativeArray<PackedMatrix>(count, Allocator.TempJob, NativeArrayOptions.UninitializedMemory);

        try
        {
            // Unity.Mathematics.Random rejects a zero seed, so clamp the frame counter to at least 1.
            random = new Unity.Mathematics.Random(math.max(1u, (uint)Time.frameCount));

            var moveJob = new RandomMoveJob
            {
                matrices = tempMatrices,
                targetMovePoints = tempTargetPoints,
                random = random,
                m_DeltaTime = Time.deltaTime * 4f,
                randomPostionRange = m_randomRange,
                obj2WorldArr = tempobjectToWorldArr,
                world2ObjArr = tempWorldToObjectArr
            };

            // The results are needed immediately for the upload below, so wait for the job.
            moveJob.Schedule(count, 64).Complete();

            // Copy back into the existing managed arrays instead of allocating
            // new ones with ToArray() every frame.
            tempMatrices.CopyTo(matrices);
            tempTargetPoints.CopyTo(m_TargetPoints);

            // The destination index is expressed in PackedMatrix-sized elements.
            m_InstanceData.SetData(tempobjectToWorldArr, 0, (int)(byteAddressObjectToWorld / kSizeOfPackedMatrix), count);
            m_InstanceData.SetData(tempWorldToObjectArr, 0, (int)(byteAddressWorldToObject / kSizeOfPackedMatrix), count);
        }
        finally
        {
            // TempJob allocations must be released even if something above throws.
            tempMatrices.Dispose();
            tempTargetPoints.Dispose();
            tempobjectToWorldArr.Dispose();
            tempWorldToObjectArr.Dispose();
        }

        int workerCount = JobsUtility.JobWorkerCount;
        if (workerCount != m_LastWorkerCount)
        {
            m_LastWorkerCount = workerCount;
            workerCountText.text = $"JobWorkerCount:{workerCount}";
        }
    }

    // Creates the raw GraphicsBuffer that holds all per-instance shader data.
    private void AllocateInstanceDateBuffer()
    {
        m_InstanceData = new GraphicsBuffer(GraphicsBuffer.Target.Raw,
            BufferCountForInstances(kBytesPerInstance, kNumInstances, kExtraBytes),
            sizeof(int));
    }

    // Re-uploads the managed packed matrix arrays to the instance buffer.
    private void RefreshData()
    {
        m_InstanceData.SetData(objectToWorld, 0, (int)(byteAddressObjectToWorld / kSizeOfPackedMatrix), objectToWorld.Length);
        m_InstanceData.SetData(worldToObject, 0, (int)(byteAddressWorldToObject / kSizeOfPackedMatrix), worldToObject.Length);
    }

    // Builds the initial per-instance data, uploads it and registers the batch.
    private void PopulateInstanceDataBuffer()
    {
        // Place a zero matrix at the start of the instance data buffer, so loads from address 0 return zero.
        var zero = new Matrix4x4[1] { Matrix4x4.zero };

        // Transform matrices, their packed form, the packed inverses and a
        // unique color for every instance.
        matrices = new Matrix4x4[kNumInstances];
        objectToWorld = new PackedMatrix[kNumInstances];
        worldToObject = new PackedMatrix[kNumInstances];
        colors = new Vector4[kNumInstances];

        // Lay the instances out on a grid with m_RowCount columns, centered on the origin.
        var offset = new Vector3(m_RowCount, 0, Mathf.CeilToInt(kNumInstances / (float)m_RowCount)) * 0.5f;
        for (int i = 0; i < kNumInstances; i++)
        {
            matrices[i] = Matrix4x4.Translate(new Vector3(i % m_RowCount, 0, i / m_RowCount) - offset);
            objectToWorld[i] = new PackedMatrix(matrices[i]);
            // Each instance needs the inverse of its OWN matrix, not that of instance 0.
            worldToObject[i] = new PackedMatrix(matrices[i].inverse);
            colors[i] = UnityEngine.Random.ColorHSV();
        }

        // The instance data is placed into the buffer like this (N = kNumInstances):
        // Offset       | Description
        // 0            | 64 bytes of zeroes, so loads from address 0 return zeroes
        // 64           | 32 uninitialized bytes to make working with SetData easier, otherwise unnecessary
        // 96           | unity_ObjectToWorld, N packed float3x4 matrices
        // 96 + 48 * N  | unity_WorldToObject, N packed float3x4 matrices
        // 96 + 96 * N  | _BaseColor, N float4s

        // Calculates start addresses for the different instanced properties. unity_ObjectToWorld starts at
        // address 96 instead of 64 which means 32 bytes are left uninitialized. This is because the
        // computeBufferStartIndex parameter requires the start offset to be divisible by the size of the source
        // array element type. In this case, it's the size of PackedMatrix, which is 48.
        byteAddressObjectToWorld = kSizeOfPackedMatrix * 2;
        byteAddressWorldToObject = byteAddressObjectToWorld + kSizeOfPackedMatrix * (uint)kNumInstances;
        byteAddressColor = byteAddressWorldToObject + kSizeOfPackedMatrix * (uint)kNumInstances;

        // Upload the instance data to the GraphicsBuffer so the shader can load them.
        m_InstanceData.SetData(zero, 0, 0, 1);
        m_InstanceData.SetData(objectToWorld, 0, (int)(byteAddressObjectToWorld / kSizeOfPackedMatrix), objectToWorld.Length);
        m_InstanceData.SetData(worldToObject, 0, (int)(byteAddressWorldToObject / kSizeOfPackedMatrix), worldToObject.Length);
        m_InstanceData.SetData(colors, 0, (int)(byteAddressColor / kSizeOfFloat4), colors.Length);

        // Set up metadata values to point to the instance data. Set the most significant bit 0x80000000 in each
        // which instructs the shader that the data is an array with one value per instance, indexed by the instance index.
        // Any metadata values that the shader uses and not set here will be zero. When such a value is used with
        // UNITY_ACCESS_DOTS_INSTANCED_PROP (i.e. without a default), the shader interprets the
        // 0x00000000 metadata value and loads from the start of the buffer. The start of the buffer
        // is a zero matrix so this sort of load is guaranteed to return zero, which is a reasonable default value.
        var metadata = new NativeArray<MetadataValue>(3, Allocator.Temp);
        metadata[0] = new MetadataValue { NameID = Shader.PropertyToID("unity_ObjectToWorld"), Value = 0x80000000 | byteAddressObjectToWorld, };
        metadata[1] = new MetadataValue { NameID = Shader.PropertyToID("unity_WorldToObject"), Value = 0x80000000 | byteAddressWorldToObject, };
        metadata[2] = new MetadataValue { NameID = Shader.PropertyToID("_BaseColor"), Value = 0x80000000 | byteAddressColor, };

        // Finally, create a batch for the instances, and make the batch use the GraphicsBuffer with the
        // instance data, as well as the metadata values that specify where the properties are.
        m_BatchID = m_BRG.AddBatch(metadata, m_InstanceData.bufferHandle);
    }

    // Raw buffers are allocated in ints. This is a utility method that calculates
    // the required number of ints for the data.
    int BufferCountForInstances(int bytesPerInstance, int numInstances, int extraBytes = 0)
    {
        // Round byte counts to int multiples
        bytesPerInstance = (bytesPerInstance + sizeof(int) - 1) / sizeof(int) * sizeof(int);
        extraBytes = (extraBytes + sizeof(int) - 1) / sizeof(int) * sizeof(int);
        int totalBytes = bytesPerInstance * numInstances + extraBytes;
        return totalBytes / sizeof(int);
    }

    // Releases the renderer group and the GPU instance buffer.
    private void OnDisable()
    {
        // Null-conditional: OnDisable can run before Start created anything.
        m_BRG?.Dispose();
        m_BRG = null;

        // The GraphicsBuffer is a native resource and must be released explicitly.
        m_InstanceData?.Dispose();
        m_InstanceData = null;
    }

    // Culling callback invoked by Unity for the BatchRendererGroup. This example
    // performs no real culling: it emits one draw command covering every instance.
    public unsafe JobHandle OnPerformCulling(
        BatchRendererGroup rendererGroup,
        BatchCullingContext cullingContext,
        BatchCullingOutput cullingOutput,
        IntPtr userContext)
    {
        // UnsafeUtility.Malloc() requires an alignment, so use the largest integer type's alignment
        // which is a reasonable default.
        int alignment = UnsafeUtility.AlignOf<long>();

        // Acquire a pointer to the BatchCullingOutputDrawCommands struct so you can easily
        // modify it directly.
        var drawCommands = (BatchCullingOutputDrawCommands*)cullingOutput.drawCommands.GetUnsafePtr();

        // Allocate memory for the output arrays. In a more complicated implementation, you would calculate
        // the amount of memory to allocate dynamically based on what is visible.
        // This example assumes that all of the instances are visible and thus allocates
        // memory for each of them. The necessary allocations are as follows:
        // - a single draw command (which draws kNumInstances instances)
        // - a single draw range (which covers our single draw command)
        // - kNumInstances visible instance indices.
        // You must always allocate the arrays using Allocator.TempJob.
        drawCommands->drawCommands = (BatchDrawCommand*)UnsafeUtility.Malloc(UnsafeUtility.SizeOf<BatchDrawCommand>(), alignment, Allocator.TempJob);
        drawCommands->drawRanges = (BatchDrawRange*)UnsafeUtility.Malloc(UnsafeUtility.SizeOf<BatchDrawRange>(), alignment, Allocator.TempJob);
        drawCommands->visibleInstances = (int*)UnsafeUtility.Malloc(kNumInstances * sizeof(int), alignment, Allocator.TempJob);
        drawCommands->drawCommandPickingInstanceIDs = null;

        drawCommands->drawCommandCount = 1;
        drawCommands->drawRangeCount = 1;
        drawCommands->visibleInstanceCount = kNumInstances;

        // This example doesn't use depth sorting, so it leaves instanceSortingPositions as null.
        drawCommands->instanceSortingPositions = null;
        drawCommands->instanceSortingPositionFloatCount = 0;

        // Configure the single draw command to draw kNumInstances instances
        // starting from offset 0 in the array, using the batch, material and mesh
        // IDs registered in the Start() method. It doesn't set any special flags.
        drawCommands->drawCommands[0].visibleOffset = 0;
        drawCommands->drawCommands[0].visibleCount = (uint)kNumInstances;
        drawCommands->drawCommands[0].batchID = m_BatchID;
        drawCommands->drawCommands[0].materialID = m_MaterialID;
        drawCommands->drawCommands[0].meshID = m_MeshID;
        drawCommands->drawCommands[0].submeshIndex = 0;
        drawCommands->drawCommands[0].splitVisibilityMask = 0xff;
        drawCommands->drawCommands[0].flags = 0;
        drawCommands->drawCommands[0].sortingPosition = 0;

        // Configure the single draw range to cover the single draw command which
        // is at offset 0.
        drawCommands->drawRanges[0].drawCommandsBegin = 0;
        drawCommands->drawRanges[0].drawCommandsCount = 1;

        // This example doesn't care about shadows or motion vectors, so it leaves everything
        // at the default zero values, except the renderingLayerMask which it sets to all ones
        // so Unity renders the instances regardless of mask settings.
        drawCommands->drawRanges[0].filterSettings = new BatchFilterSettings { renderingLayerMask = 0xffffffff, };

        // Finally, write the actual visible instance indices to the array. In a more complicated
        // implementation, this output would depend on what is visible, but this example
        // assumes that everything is visible.
        for (int i = 0; i < kNumInstances; ++i)
        {
            drawCommands->visibleInstances[i] = i;
        }

        // This simple example doesn't use jobs, so it returns an empty JobHandle.
        // Performance-sensitive applications are encouraged to use Burst jobs to implement
        // culling and draw command output. In this case, this function returns a
        // handle here that completes when the Burst jobs finish.
        return new JobHandle();
    }
}

/// <summary>
/// Parallel job that walks every instance towards its current target point and
/// writes the resulting transform into the packed object-to-world and
/// world-to-object arrays that are uploaded to the GPU instance buffer.
/// When an instance gets close to its target, a new random target on the XZ
/// plane is chosen inside <see cref="randomPostionRange"/>.
/// </summary>
[BurstCompile]
partial struct RandomMoveJob : IJobParallelFor
{
    // Per-frame seed source. It is never advanced inside Execute: a job field is
    // not a safe place for mutable state in a parallel job, and advancing it
    // would make every worker produce the same sequence.
    [ReadOnly]
    public Unity.Mathematics.Random random;

    // (minX, maxX, minZ, maxZ) bounds for newly picked target points.
    [ReadOnly]
    public float4 randomPostionRange;

    // Distance moved this frame (the caller passes delta time scaled by speed).
    [ReadOnly]
    public float m_DeltaTime;

    // Current transform of every instance; read and rewritten in place.
    public NativeArray<Matrix4x4> matrices;

    // Current target point of every instance; rewritten when a target is reached.
    public NativeArray<float3> targetMovePoints;

    // Output: packed object-to-world matrix per instance.
    public NativeArray<PackedMatrix> obj2WorldArr;

    // Output: packed world-to-object (inverse) matrix per instance.
    public NativeArray<PackedMatrix> world2ObjArr;

    public void Execute(int index)
    {
        float3 curPos = matrices[index].GetPosition();
        float3 target = targetMovePoints[index];

        // Close enough to the target (squared distance): pick a new one.
        if (math.lengthsq(target - curPos) < 0.4f)
        {
            // Derive an independent generator from the frame seed and the element
            // index so that instances do not share random values. "| 1u" keeps
            // the seed non-zero, which Unity.Mathematics.Random requires.
            var rng = new Unity.Mathematics.Random(
                math.hash(new uint2(random.state, (uint)index)) | 1u);

            target.x = rng.NextFloat(randomPostionRange.x, randomPostionRange.y);
            target.z = rng.NextFloat(randomPostionRange.z, randomPostionRange.w);
            targetMovePoints[index] = target;
        }

        // Fall back to "forward" when the instance sits exactly on its target.
        float3 dir = math.normalizesafe(target - curPos, Vector3.forward);
        curPos += dir * m_DeltaTime;

        Matrix4x4 mat = Matrix4x4.TRS(curPos, Quaternion.LookRotation(dir), Vector3.one);
        matrices[index] = mat;

        // Every field of the packed matrices is overwritten, so there is no need
        // to read the previous element first.
        obj2WorldArr[index] = new PackedMatrix(mat);
        world2ObjArr[index] = new PackedMatrix(mat.inverse);
    }
}

推荐阅读

评论可见,请评论后查看内容,谢谢!!!评论后请刷新页面。