初次嘗試GPU Driven —— 大范圍植被渲染
GPU Driven 簡單來說,就是把整體邏輯放到GPU上運行,解放CPU、壓榨GPU。這是初次嘗試,記錄一下研究過程。
渡神紀
塞爾達
塞爾達
塞爾達
在開放世界游戲里,經常會有大范圍植被渲染,這些花花草草數量驚人,動輒數十上百萬,光看這數字都能感覺到性能壓力撲面而來,那么這些花花草草值得花費如此高昂成本去渲染嗎?究竟是人性的扭曲,還是道德的淪喪?
先寫個初版實現效果
初版實現很簡單,通過一張紋理控制草的生長范圍,把世界坐標映射到紋理UV,從紋理采樣判斷該坐標上是否長草,篩選完所有坐標后,用GPU Instancing渲染就可以了。這個思路簡單粗暴,一次渲染了整個場景的草,假設這個場景大小是1024 x 1024平方米,每平方米1顆草,那么一次就要渲染約100萬顆草(1024 x 1024 ≈ 105萬)。接下來嘗試優化這個過程,在游戲中,每一個瞬間並不能看到全部內容,視野外的看不見,被擋住的看不見,事實上大部分內容都看不見,在那些看不到的地方渲染的草是多余的,剔除掉這些多余的草則是本例的目的。
本例通過以下4個步驟進行剔除:
- 限定渲染范圍
- 對渲染范圍四叉分割LOD
- 視錐裁剪
- HizMap裁剪
1. 限定渲染范圍
計算出視錐體的包圍盒,用該包圍盒覆蓋的平面范圍去采樣,從而限定渲染范圍,該步驟目的是為了讓渲染范圍僅跟視野范圍相關而不會隨着場景越大渲染范圍越大。
部分代碼:
/// <summary>
/// Grows the bounds stored in <c>mFrustumAABB</c> so that they contain the given point.
/// The bounds use x/y as the lower limits and z/w as the upper limits; the point's
/// x component is compared against x/z and its z component against y/w.
/// </summary>
/// <param name="coord">Point to include; only its x and z components are read.</param>
void UpdateFrustumAABB(Vector3 coord)
{
    var bounds = mFrustumAABB;

    // First axis (coord.x): push the lower limit down, otherwise the upper limit up.
    if (coord.x < bounds.x)
    {
        bounds.x = coord.x;
    }
    else if (coord.x > bounds.z)
    {
        bounds.z = coord.x;
    }

    // Second axis (coord.z): same rule, stored in the y/w components.
    if (coord.z < bounds.y)
    {
        bounds.y = coord.z;
    }
    else if (coord.z > bounds.w)
    {
        bounds.w = coord.z;
    }

    mFrustumAABB = bounds;
}
...
// Derive the eight frustum corner points from the camera parameters, then build
// the 2D bounds (mFrustumAABB) that enclose them.
var frustumCamera = GrabDepthComp.SelfCamera;
var frustumCameraTransform = frustumCamera.transform;
var frustumCameraPosition = frustumCameraTransform.position;

// Half extents of the near and far clip rectangles along the camera's up axis.
var halfFovTan = Mathf.Tan(frustumCamera.fieldOfView * Mathf.Deg2Rad * 0.5f);
var nearHalfH = halfFovTan * frustumCamera.nearClipPlane;
var farHalfH = halfFovTan * frustumCamera.farClipPlane;

// Offsets from the centre of each clip rectangle to its top and right edges.
var nearToT = nearHalfH * frustumCameraTransform.up;
var nearToR = nearHalfH * frustumCamera.aspect * frustumCameraTransform.right;
var farToT = farHalfH * frustumCameraTransform.up;
var farToR = farHalfH * frustumCamera.aspect * frustumCameraTransform.right;

// Centres of the near and far clip rectangles.
var nearPosition = frustumCameraPosition + frustumCameraTransform.forward * frustumCamera.nearClipPlane;
var farPosition = frustumCameraPosition + frustumCameraTransform.forward * frustumCamera.farClipPlane;

// Corner naming: L/R = left/right, B/T = bottom/top.
mFrustumNearLB = nearPosition - nearToT - nearToR;
mFrustumNearRB = nearPosition - nearToT + nearToR;
mFrustumNearLT = nearPosition + nearToT - nearToR;
mFrustumNearRT = nearPosition + nearToT + nearToR;
mFrustumFarLB = farPosition - farToT - farToR;
mFrustumFarRB = farPosition - farToT + farToR;
mFrustumFarLT = farPosition + farToT - farToR;
mFrustumFarRT = farPosition + farToT + farToR;

// Compute the frustum AABB: start from a zero-size box at the camera position
// (x/z components) and grow it with every corner.
mFrustumAABB = new Vector4(frustumCameraPosition.x, frustumCameraPosition.z,
                           frustumCameraPosition.x, frustumCameraPosition.z);
UpdateFrustumAABB(mFrustumNearLB);
UpdateFrustumAABB(mFrustumNearRB);
UpdateFrustumAABB(mFrustumNearLT);
UpdateFrustumAABB(mFrustumNearRT);
UpdateFrustumAABB(mFrustumFarLB);
UpdateFrustumAABB(mFrustumFarRB);
UpdateFrustumAABB(mFrustumFarLT);
UpdateFrustumAABB(mFrustumFarRT);

// Pad the box outwards by FrustumOutDistance and keep it inside [0, WorldSize].
mFrustumAABB.x = Mathf.Clamp(mFrustumAABB.x - FrustumOutDistance, 0, WorldSize);
mFrustumAABB.y = Mathf.Clamp(mFrustumAABB.y - FrustumOutDistance, 0, WorldSize);
mFrustumAABB.z = Mathf.Clamp(mFrustumAABB.z + FrustumOutDistance, 0, WorldSize);
mFrustumAABB.w = Mathf.Clamp(mFrustumAABB.w + FrustumOutDistance, 0, WorldSize);
效果:
2. 對渲染范圍四叉分割LOD
雖然上述步驟限定了渲染范圍,但范圍依舊很大,比如站在高山上看遠方,視野開闊,可視距離遠,但並非可見之處都需要渲染高密度的草叢,因為遠方的草叢看不見細節,只能看到一片綠色,該步驟將渲染范圍拆分多個LOD,近處的高密度渲染,遠處的低密度渲染。(通常會把長草的地面用綠色,從而達到在遠處看,即使沒有草叢也會看到一片綠,可見文章開頭第三張圖片)
四叉樹分割算法大致思路是,若區塊中心到相機的距離短於區塊最長邊,則該區塊需要四叉分割並且LOD+1。
初始的區塊LOD為0,隨着多次細分,LOD逐漸變大,LOD越大則渲染越密集。
部分代碼:
/// <summary>
/// A rectangular region paired with a level-of-detail value.
/// </summary>
class FrustumTreeNode
{
    /// <summary>Level-of-detail value given to the constructor.</summary>
    public int LOD;

    /// <summary>Region bounds packed into a single Vector4.</summary>
    public Vector4 AABB;

    /// <summary>Creates a node that stores the given level and bounds as-is.</summary>
    /// <param name="lod">Level-of-detail value for this region.</param>
    /// <param name="aabb">Bounds of this region.</param>
    public FrustumTreeNode(int lod, Vector4 aabb)
    {
        this.LOD = lod;
        this.AABB = aabb;
    }
}
...
// Split the render bounds into a quadtree: on every pass, each region whose
// centre is closer to the camera than the region's longest side is replaced by
// its four quadrants, tagged with the next LOD value.
var cameraWorldPosition = GrabDepthComp.transform.position;
var cameraCoord = new Vector2(cameraWorldPosition.x, cameraWorldPosition.z);

// List A holds the current set of regions; list B receives the next set.
mFrustumTreeA.Clear();
mFrustumTreeA.Add(new FrustumTreeNode(0, mFrustumAABB));
for (var lod = 0; lod != LODNumber; ++lod)
{
    mFrustumTreeB.Clear();
    for (var i = 0; i != mFrustumTreeA.Count; ++i)
    {
        var node = mFrustumTreeA[i];
        var bounds = node.AABB;

        // bounds.x / bounds.y are the lower limits, bounds.z / bounds.w the upper limits.
        var length = Mathf.Max(bounds.z - bounds.x, bounds.w - bounds.y);
        var center = new Vector2((bounds.x + bounds.z) / 2,
                                 (bounds.y + bounds.w) / 2);

        // Far enough away: carry the region over unchanged.
        if (!((cameraCoord - center).magnitude < length))
        {
            mFrustumTreeB.Add(node);
            continue;
        }

        // Close to the camera: emit the four quadrants around the centre.
        var childLod = lod + 1;
        mFrustumTreeB.Add(new FrustumTreeNode(childLod, new Vector4(bounds.x, bounds.y, center.x, center.y)));
        mFrustumTreeB.Add(new FrustumTreeNode(childLod, new Vector4(center.x, bounds.y, bounds.z, center.y)));
        mFrustumTreeB.Add(new FrustumTreeNode(childLod, new Vector4(center.x, center.y, bounds.z, bounds.w)));
        mFrustumTreeB.Add(new FrustumTreeNode(childLod, new Vector4(bounds.x, center.y, center.x, bounds.w)));
    }

    // The freshly built list becomes the input of the next pass.
    Tools.Swap(ref mFrustumTreeA, ref mFrustumTreeB);
}
效果:
3. 視錐裁剪
前面兩步剔除掉了大量額外渲染,但渲染范圍是通過包圍盒求出的,包圍盒是一個長方體,它除了覆蓋視野范圍還覆蓋了一些多余的范圍,該步驟將剔除這些多余的范圍。
思路是先求出視錐的6個面,隨后在Compute Shader中,對每個單位求出包圍盒,如果這個包圍盒不在視錐體的6個面內,則該單位不可見。
首先每個單位的包圍盒是很好計算的,只要有坐標就可以算出包圍盒,而坐標在第一步中就能拿到。
其次,視錐體的6個面可通過UnityAPI算出,但據說引擎會調用到底層的C++導致一些不必要的開銷,並且該接口需要Plane對象,導致后面對Shader傳參不方便,所以手動算就好了。第一步已經計算出了視錐體的8個頂點,所以拿這8個頂點就能計算6個平面了。
部分代碼:
C#
// Compute the six frustum clipping planes from the eight frustum corner points.
// Each plane is stored as (normal.xyz, w) with w = dot(normal, -pointOnPlane),
// so that dot(normal, p) + w is the signed distance of p from the plane.

// Left / right
var lNormal = Vector3.Cross(mFrustumNearLB - mFrustumFarLB, mFrustumNearLT - mFrustumNearLB).normalized;
var rNormal = Vector3.Cross(mFrustumNearRB - mFrustumNearRT, mFrustumFarRB - mFrustumNearRB).normalized;

// Bottom / top
var dNormal = Vector3.Cross(mFrustumNearLB - mFrustumNearRB, mFrustumFarLB - mFrustumNearLB).normalized;
var uNormal = Vector3.Cross(mFrustumNearRT - mFrustumNearLT, mFrustumFarRT - mFrustumNearRT).normalized;

// Near / far
var nNormal = Vector3.Cross(mFrustumNearRB - mFrustumNearLB, mFrustumNearRT - mFrustumNearRB).normalized;
var fNormal = Vector3.Cross(mFrustumFarRB - mFrustumFarRT, mFrustumFarLB - mFrustumFarRB).normalized;

// Pack each normal together with its plane offset; the anchor point is a corner lying on that plane.
mFrustumPlanes[0] = new Vector4(lNormal.x, lNormal.y, lNormal.z, Vector3.Dot(lNormal, -mFrustumNearLB));
mFrustumPlanes[1] = new Vector4(rNormal.x, rNormal.y, rNormal.z, Vector3.Dot(rNormal, -mFrustumNearRB));
mFrustumPlanes[2] = new Vector4(dNormal.x, dNormal.y, dNormal.z, Vector3.Dot(dNormal, -mFrustumNearLB));
mFrustumPlanes[3] = new Vector4(uNormal.x, uNormal.y, uNormal.z, Vector3.Dot(uNormal, -mFrustumNearRT));
mFrustumPlanes[4] = new Vector4(nNormal.x, nNormal.y, nNormal.z, Vector3.Dot(nNormal, -mFrustumNearRB));
mFrustumPlanes[5] = new Vector4(fNormal.x, fNormal.y, fNormal.z, Vector3.Dot(fNormal, -mFrustumFarRB));
HLSL
// Returns true when the axis-aligned cube centred at `coord` with half-extent
// `width` has all eight corners on the non-positive side of `plane`
// (plane.xyz = normal, plane.w = offset), i.e. dot(corner, normal) + w <= 0.
bool IsCullByPlane(float3 coord, float width, float4 plane)
{
    bool allOutside = true;
    [unroll]
    for (uint corner = 0; corner < 8; ++corner)
    {
        // Bits 0, 1 and 2 of the corner index select the -/+ side on x, y and z.
        float3 side = float3((corner & 1) != 0 ? 1.0f : -1.0f,
                             (corner & 2) != 0 ? 1.0f : -1.0f,
                             (corner & 4) != 0 ? 1.0f : -1.0f);
        float3 p = coord + side * width;
        allOutside = allOutside && (dot(p, plane.xyz) + plane.w <= 0);
    }
    return allOutside;
}
// Returns true when the cube (centre `coord`, half-extent `width`) is rejected
// by at least one of the six planes in _FrustumPlanes.
bool IsCullByFrustum(float3 coord, float width)
{
    bool culled = false;
    [unroll]
    for (uint planeIndex = 0; planeIndex < 6; ++planeIndex)
    {
        culled = culled || IsCullByPlane(coord, width, _FrustumPlanes[planeIndex]);
    }
    return culled;
}
效果:
4. HizMap裁剪
經過上述三個步驟,幾乎剔除掉了所有多余單位,但在有些情況下,仍然有多余的單位被渲染。例如前面有一堵牆,牆后的單位看不見,但它仍然被渲染,只不過最終沒有呈現在屏幕上,當前步驟則用於剔除牆后的單位。
HizMap的全名叫Hierarchical Z-buffer Map,它主要有兩部分:
- 生成HizMap
- 使用HizMap
生成HizMap
將深度圖生成Mipmaps,但該Mipmaps跟常規不一樣的是,常規方法通過插值迭代每一層,而HizMap是通過取最大(或最小)的值迭代每一層。
使用HizMap
對每一個渲染單位,將包圍盒映射到屏幕空間,再用屏幕空間包圍盒大小計算出對應的LOD,之后用這個LOD和屏幕坐標采樣HizMap。因為HizMap中記錄的是最大(或最小)深度,所以若采樣的結果小於(或大於)當前單位的深度,則表示當前單位不可見,可以剔除。
下面用幾張圖概括HizMap的原理:
最終的HLSL
// Grass distribution: kernel entry points and shared inputs.
// NOTE(review): no HizMapDebug kernel is visible in this listing — confirm it is
// defined elsewhere, otherwise the matching #pragma below has no entry point.
#pragma kernel BuildGrass
#pragma kernel HizMapInit
#pragma kernel HizMapCopy
#pragma kernel HizMapDebug
// Per-axis thread limit; each kernel skips threads whose id.x / id.y reach it.
float2 _DispatchLimit;
// xy: origin
// zw: unit (spacing between neighbouring candidate positions)
float4 _GrassInputArgs;
// xy: world-to-mask-texture scale (BuildGrass multiplies the world coordinate by it to get a texel index)
// zw: offset distance (BuildGrass scales the per-texel offset by it)
float4 _GrassMaskScale;
// Mask read by BuildGrass: b > 0 enables grass, xy is remapped to a [-1, 1] offset.
Texture2D<float4> _GrassMaskTexture;
// Receives the coordinates that pass both culling tests.
AppendStructuredBuffer<float4> _GrassCoordOutput;
// Frustum: six planes, xyz = normal, w = offset.
float4 _FrustumPlanes[6];
// HizMap: depth mip chain read by IsCullByHizMap.
Texture2D<float4> _HizMap;
// Matrix applied to world coordinates in TransformToUVD before the perspective divide.
float4x4 _WToViewProj;
// x: HizMap width in texels at mip 0, y: number of mip levels,
// zw: divisor applied to the thread id in HizMapInit when reading _CameraDepth.
float4 _HizMapParams;
// Returns true when the axis-aligned cube centred at `coord` with half-extent
// `width` has all eight corners on the non-positive side of `plane`
// (plane.xyz = normal, plane.w = offset), i.e. dot(corner, normal) + w <= 0.
bool IsCullByPlane(float3 coord, float width, float4 plane)
{
    bool allOutside = true;
    [unroll]
    for (uint corner = 0; corner < 8; ++corner)
    {
        // Bits 0, 1 and 2 of the corner index select the -/+ side on x, y and z.
        float3 side = float3((corner & 1) != 0 ? 1.0f : -1.0f,
                             (corner & 2) != 0 ? 1.0f : -1.0f,
                             (corner & 4) != 0 ? 1.0f : -1.0f);
        float3 p = coord + side * width;
        allOutside = allOutside && (dot(p, plane.xyz) + plane.w <= 0);
    }
    return allOutside;
}
// Returns true when the cube (centre `coord`, half-extent `width`) is rejected
// by at least one of the six planes in _FrustumPlanes.
bool IsCullByFrustum(float3 coord, float width)
{
    bool culled = false;
    [unroll]
    for (uint planeIndex = 0; planeIndex < 6; ++planeIndex)
    {
        culled = culled || IsCullByPlane(coord, width, _FrustumPlanes[planeIndex]);
    }
    return culled;
}
// Picks the HizMap mip level whose texel size is at least as large as the
// screen-space rectangle [boundsMin, boundsMax] (given in 0..1 UV units).
//   boundsMin / boundsMax : opposite corners of the rectangle in UV space.
//   returns               : mip index in [0, _HizMapParams.y - 1].
// _HizMapParams.x converts the UV extent to mip-0 texels; _HizMapParams.y is
// the number of mip levels.
uint GetHizMapIndex(float2 boundsMin, float2 boundsMax)
{
    // A footprint below one texel (or a degenerate, zero-size one) would make
    // log2 negative or -inf. Converting that to uint is only well-defined on
    // some targets, so clamp the extent to one texel, which maps to mip 0.
    float2 extentInTexels = max((boundsMax - boundsMin) * _HizMapParams.x, 1.0f);
    float2 mipPerAxis = ceil(log2(extentInTexels));
    float mip = max(mipPerAxis.x, mipPerAxis.y);

    // Clamp in float space before the integer conversion so the result can
    // never leave the valid mip range.
    float lastMip = max(_HizMapParams.y - 1.0f, 0.0f);
    return (uint)clamp(mip, 0.0f, lastMip);
}
// Projects a world-space point with _WToViewProj and returns (u, v, depth):
// the perspective-divided xyz is remapped from [-1, 1] to [0, 1], and the
// depth component is then flipped (depth = 1 - z).
float3 TransformToUVD(float3 coord)
{
    float4 clip = mul(_WToViewProj, float4(coord, 1));
    float3 ndc = clip.xyz / clip.w;
    float3 uvd = (ndc + 1) * 0.5f;
    uvd.z = 1.0f - uvd.z;
    return uvd;
}
// Occlusion test against the hierarchical depth map.
//   coord : centre of the cube to test (world space).
//   width : half-extent of the cube.
//   returns true when the cube is hidden behind the depth stored in _HizMap.
// Depth range is 1..0 (see TransformToUVD). The test assumes larger values are
// closer to the camera and that each HizMap texel stores the smallest depth of
// the area it covers, as HizMapCopy produces.
bool IsCullByHizMap(float3 coord, float width)
{
    // Screen-space bounds (uv + depth) of the cube's eight projected corners.
    float3 boundsMin = TransformToUVD(coord + float3(-width, -width, -width));
    float3 boundsMax = boundsMin;
    [unroll]
    for (uint corner = 1; corner < 8; ++corner)
    {
        // Bits 0, 1 and 2 of the corner index select the -/+ side on x, y and z.
        float3 side = float3((corner & 1) != 0 ? 1.0f : -1.0f,
                             (corner & 2) != 0 ? 1.0f : -1.0f,
                             (corner & 4) != 0 ? 1.0f : -1.0f);
        float3 uvd = TransformToUVD(coord + side * width);
        boundsMin = min(boundsMin, uvd);
        boundsMax = max(boundsMax, uvd);
    }

    // Choose the mip where the rectangle spans at most two texels per axis.
    uint mip = GetHizMapIndex(boundsMin.xy, boundsMax.xy);
    float hizMapWidth = _HizMapParams.x / pow(2, mip);
    float2 uv0 = min(hizMapWidth - 1, floor(boundsMin.xy * hizMapWidth));
    float2 uv1 = min(hizMapWidth - 1, floor(boundsMax.xy * hizMapWidth));

    // The rectangle can straddle a 2x2 texel neighbourhood, so all four texels
    // must be read. Reading only the two diagonal ones misses a farther depth
    // in the other two and would wrongly cull a visible cube.
    float d00 = _HizMap.mips[mip][uv0].r;
    float d10 = _HizMap.mips[mip][float2(uv1.x, uv0.y)].r;
    float d01 = _HizMap.mips[mip][float2(uv0.x, uv1.y)].r;
    float d11 = _HizMap.mips[mip][uv1].r;
    float farthestOccluder = min(min(d00, d10), min(d01, d11));

    // Hidden only if the cube's nearest depth is behind every covered texel.
    return boundsMax.z < farthestOccluder;
}
// Kernel: one thread per candidate grass position. Reads the grass mask at the
// position, applies the mask's xy offset, and appends the position to
// _GrassCoordOutput unless it is rejected by the frustum or HizMap tests.
[numthreads(8, 8, 1)]
void BuildGrass (uint3 id : SV_DispatchThreadID)
{
    // Threads outside the requested dispatch area do nothing.
    if (!((float)id.x < _DispatchLimit.x && (float)id.y < _DispatchLimit.y))
    {
        return;
    }

    // Candidate position: origin + unit + unit * thread index.
    float2 worldCoord = _GrassInputArgs.xy
                      + _GrassInputArgs.zw
                      + _GrassInputArgs.zw * id.xy;

    // The mask's blue channel decides whether grass grows at this position.
    float2 maskIndex = floor(worldCoord * _GrassMaskScale.xy);
    float4 maskValue = _GrassMaskTexture[maskIndex];
    if (!(maskValue.b > 0))
    {
        return;
    }

    // Remap the mask's xy from [0, 1] to [-1, 1] and use it to shift the position.
    float2 offset = maskValue.xy * 2.0f - 1.0f;
    worldCoord.xy += _GrassMaskScale.zw*offset;

    // Expand to (x, 0, y, 0): the 2D coordinate lands in the x and z components.
    float4 oCoord = worldCoord.xxyy * float4(1, 0, 1, 0);

    if (IsCullByFrustum(oCoord.xyz, 0.5))
    {
        return;
    }
    if (IsCullByHizMap(oCoord.xyz, 0.5))
    {
        return;
    }

    _GrassCoordOutput.Append(oCoord);
}
// Inputs and outputs of the HizMap build kernels below.
// Not read by any kernel visible in this listing.
int _HizMapMip;
// Source depth texture read by HizMapInit.
Texture2D<float4> _CameraDepth;
// Source level read by HizMapCopy.
Texture2D<float4> _HizMapIn;
// Both outputs receive the same value from HizMapInit and HizMapCopy.
RWTexture2D<float4> _HizMapOut0;
RWTexture2D<float4> _HizMapOut1;
// Kernel: fills the first HizMap level from the camera depth texture. Each
// thread reads the depth texel at floor(id.xy / _HizMapParams.zw) and writes
// it to both output textures.
[numthreads(8, 8, 1)]
void HizMapInit (uint3 id : SV_DispatchThreadID)
{
    // Threads outside the requested dispatch area do nothing.
    if (!((float)id.x < _DispatchLimit.x && (float)id.y < _DispatchLimit.y))
    {
        return;
    }

    float2 sourceTexel = floor(id.xy / _HizMapParams.zw);
    float4 depth = _CameraDepth[sourceTexel];
    _HizMapOut0[id.xy] = depth;
    _HizMapOut1[id.xy] = depth;
}
// Kernel: builds one HizMap level from the previous one. Each thread reduces a
// 2x2 block of _HizMapIn to the minimum of its red channel and writes the
// result to both output textures.
[numthreads(8, 8, 1)]
void HizMapCopy (uint3 id : SV_DispatchThreadID)
{
    // Threads outside the requested dispatch area do nothing.
    if (!((float)id.x < _DispatchLimit.x && (float)id.y < _DispatchLimit.y))
    {
        return;
    }

    // Top-left texel of the 2x2 source block for this output texel.
    float2 blockOrigin = floor(id.xy * 2);
    float topLeft     = _HizMapIn[blockOrigin               ].r;
    float topRight    = _HizMapIn[blockOrigin + float2(1, 0)].r;
    float bottomLeft  = _HizMapIn[blockOrigin + float2(0, 1)].r;
    float bottomRight = _HizMapIn[blockOrigin + float2(1, 1)].r;

    float smallest = min(min(topLeft, topRight), min(bottomLeft, bottomRight));
    _HizMapOut0[id.xy] = smallest;
    _HizMapOut1[id.xy] = smallest;
}
最終效果:
從上圖Game視圖左上角可以看到,渲染1024 x 1024 x 1的植被,經過剔除后,每幀渲染大概1100~4000之間,比起100萬,簡直少了太多。因為有HizMap,減少到0也不是不可能,Demo場景只是一個平面,簡單擺放了一些遮擋,在實際游戲場景中,遮擋物遠比Demo中復雜,HizMap能發揮更好的效果。
總結
游戲里放大量植被,既是人性扭曲,也是道德淪喪,萬惡之源來自人類聰明的小腦殼,為什么會有這么多知識點,我只想躺平,我不想再學習了。