前面几篇都是关于cuda的基础知识介绍,甚是乏味。是时候用cuda渲染一张图片来直观地感受一下了!
Kevin Beason大神曾经写过一个99行的c++版本的path tracer。麻雀虽小,五脏俱全,这个path tracer包含了很多global illumination的特性:
- Monte Carlo samling
- OpenMP
- Specular, Diffuse, and glass BRDFs
- Antialising via super-sampling with importance-sampled tent distribution
- Cosine importance sampling of the hemisphere for diffuse reflection
- Russian roulette for path termination
- Russian roulette and splitting for selecting reflection and/or refraction for glass BRDF
为了方便起见,我去掉了不必要的特性,只保留了diffuse材质。源代码请从官网上获取。修改后的代码如下:
smallPT.cpp
#include <cmath>
#include <cstdio>
#include <chrono>
#define BOUNCE 4
#define WIDTH 800
#define HEIGHT 600
#define SPP 1024
struct Vec
{
double x, y, z;
Vec(double x_ = 0, double y_ = 0, double z_ = 0) { x = x_; y = y_; z = z_; }
Vec operator+(const Vec &b) const { return Vec(x + b.x, y + b.y, z + b.z); }
Vec operator-(const Vec &b) const { return Vec(x - b.x, y - b.y, z - b.z); }
Vec operator*(double b) const { return Vec(x*b, y*b, z*b); }
Vec mult(const Vec &b) const { return Vec(x*b.x, y*b.y, z*b.z); }
Vec& norm() { return *this = *this * (1 / sqrt(x*x + y*y + z*z)); }
double dot(const Vec &b) const { return x*b.x + y*b.y + z*b.z; }
Vec operator%(Vec&b) { return Vec(y*b.z - z*b.y, z*b.x - x*b.z, x*b.y - y*b.x); }
};
struct Ray { Vec o, d; Ray(Vec o_, Vec d_) : o(o_), d(d_) {} };
enum Refl_t { DIFF, SPEC, REFR }; // diffuse, specular, refraction
struct Sphere
{
double rad;
Vec p, e, c; // position, emission, color
Refl_t refl;
Sphere(double rad_, Vec p_, Vec e_, Vec c_, Refl_t refl_) :
rad(rad_), p(p_), e(e_), c(c_), refl(refl_) {}
double intersect(const Ray &r) const
{
Vec op = p - r.o;
double t, eps = 1e-4, b = op.dot(r.d), det = b*b - op.dot(op) + rad*rad;
if (det < 0) return 0; else det = sqrt(det);
return (t = b - det) > eps ? t : ((t = b + det) > eps ? t : 0);
}
};
Sphere spheres[] =
{
Sphere(1e5, Vec(-1e5f - 50.0f, 40.0f, 80.0f), Vec(),Vec(.75,.25,.25),DIFF), // Left
Sphere(1e5, Vec(1e5f + 50.0f, 40.0f, 80.0f),Vec(),Vec(.25,.25,.75),DIFF), // Rght
Sphere(1e5, Vec(0.0f, 40.0f, -1e5f), Vec(),Vec(.75,.75,.75),DIFF), // Back
Sphere(1e5, Vec(0.0f, 40.0f, 1e5f + 600.0f), Vec(),Vec(1, 1, 1), DIFF), // Frnt
Sphere(1e5, Vec(0.0f, -1e5f, 80.0f ), Vec(),Vec(.75,.75,.75),DIFF), // Botm
Sphere(1e5, Vec(0.0f, 1e5f + 80.0f, 80.0f),Vec(),Vec(.75,.75,.75),DIFF), // Top
Sphere(16,Vec(-25.0f, 16.0f, 47.0f), Vec(),Vec(1,1,1), DIFF), // Mirr
Sphere(20,Vec(25.0f, 20.0f, 78.0f), Vec(),Vec(1,1,1), DIFF), // Glas
Sphere(600, Vec(0.0f, 678.8f, 80.0f),Vec(1.6f, 1.6f, 1.6f), Vec(), DIFF) // Lite
};
float rand(unsigned int *seed0, unsigned int *seed1)
{
*seed0 = 36969 * ((*seed0) & 65535) + ((*seed0) >> 16);
*seed1 = 18000 * ((*seed1) & 65535) + ((*seed1) >> 16);
unsigned int ires = ((*seed0) << 16) + (*seed1);
union
{
float f;
unsigned int ui;
} res;
res.ui = (ires & 0x007fffff) | 0x40000000; // bitwise AND, bitwise OR
return (res.f - 2.f) / 2.f;
}
inline double clamp(double x) { return x < 0 ? 0 : x>1 ? 1 : x; }
inline int toInt(double x) { return int(pow(clamp(x), 1 / 2.2) * 255 + .5); }
inline bool intersect(const Ray &r, double &t, int &id) {
double n = sizeof(spheres) / sizeof(Sphere), d, inf = t = 1e20;
for (int i = int(n); i--;) if ((d = spheres[i].intersect(r)) && d < t) { t = d; id = i; }
return t < inf;
}
Vec radiance(const Ray &r, int depth, unsigned int *s0, unsigned int *s1)
{
double t; // distance to intersection
int id = 0; // id of intersected object
if (depth > BOUNCE || !intersect(r, t, id)) return Vec(); // if miss, return black
const Sphere &obj = spheres[id]; // the hit object
Vec x = r.o + r.d*t, n = (x - obj.p).norm(), nl = n.dot(r.d) < 0 ? n : n*-1, f = obj.c;
if (obj.refl == DIFF)
{
double r1 = 2 * M_PI*rand(s0, s1), r2 = rand(s0, s1), r2s = sqrt(r2);
Vec w = nl, u = ((fabs(w.x) > .1 ? Vec(0, 1) : Vec(1)) % w).norm(), v = w%u;
Vec d = (u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrt(1 - r2)).norm();
return obj.e + f.mult(radiance(Ray(x, d), depth + 1, s0, s1)) * n.dot(d) * 2;
}
return Vec();
}
int main()
{
Ray cam(Vec(0, 52, 300), Vec(0, -0.05, -1.0).norm()); // cam pos, dir
Vec cx = Vec(WIDTH*.5135 / HEIGHT), cy = (cx%cam.d).norm()*.5135, r, *c = new Vec[WIDTH*HEIGHT];
std::chrono::time_point<std::chrono::system_clock> begin = std::chrono::system_clock::now();
#pragma omp parallel for schedule(dynamic, 1) private(r) // OpenMP
for (int y = 0; y < HEIGHT; y++) // Loop over image rows
{
fprintf(stderr, "\rRendering (%d spp) %5.2f%%", SPP, 100.*y / (HEIGHT - 1));
for (int x = 0; x < WIDTH; x++) // Loop cols
{
unsigned int s0 = x;
unsigned int s1 = y;
int i = (HEIGHT - y - 1)*WIDTH + x;
r = Vec();
for (int s = 0; s < SPP; s++)
{
Vec d = cx*((0.25 + x) / WIDTH - .5) + cy*((0.25 + y) / HEIGHT - .5) + cam.d;
r = r + radiance(Ray(cam.o + d * 40, d.norm()), 0, &s0, &s1);
}
r = r * (1.0 / SPP);
c[i] = Vec(clamp(r.x), clamp(r.y), clamp(r.z));
}
}
std::chrono::time_point<std::chrono::system_clock> end = std::chrono::system_clock::now();
std::chrono::duration<double> elapsedTime = end - begin;
printf("\nTime: %.6lfs\n", elapsedTime.count());
FILE *f = fopen("image.ppm", "w"); // Write image to PPM file.
fprintf(f, "P3\n%d %d\n%d\n", WIDTH, HEIGHT, 255);
for (int i = 0; i < WIDTH*HEIGHT; i++)
fprintf(f, "%d %d %d ", toInt(c[i].x), toInt(c[i].y), toInt(c[i].z));
}
Note
1.这段代码使用OpenMP进行多线程加速。
在vs2015中需要手动启用OpenMP功能:
2.生成的ppm图片文件可以用ps等软件打开
对应的cuda代码如下:
smallPT.cu
// cuda headers
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
// c++ headers
#include <cstdio>
#include <algorithm>
#include <climits>
#include <chrono>
#include "helper_math.h"
#define WIDTH 800
#define HEIGHT 600
#define SPP 1024
#define BOUNCE 4
#define SPHERE_EPSILON 0.0001f
#define BOX_EPSILON 0.001f
#define RAY_EPSILON 0.05f;
struct Ray
{
__device__ Ray(float3 pos, float3 dir) :
pos(pos), dir(dir) {}
float3 pos;
float3 dir;
};
enum class Material { Diffuse, Specular, Refractive };
struct Sphere
{
__device__ float intersect(const Ray &ray) const
{
float t;
float3 dis = pos - ray.pos;
float proj = dot(dis, ray.dir);
float delta = proj * proj - dot(dis, dis) + radius * radius;
if (delta < 0) return 0;
delta = sqrtf(delta);
return (t = proj - delta) > SPHERE_EPSILON ? t : ((t = proj + delta) > SPHERE_EPSILON ? t : 0);
}
float radius;
float3 pos, emissionColor, mainColor;
Material material;
};
__constant__ Sphere spheres[] =
{
{ 1e5f, { -1e5f - 50.0f, 40.0f, 80.0f }, { 0.0f, 0.0f, 0.0f }, { 0.75f, 0.25f, 0.25f }, Material::Diffuse }, // Left
{ 1e5f, { 1e5f + 50.0f, 40.0f, 80.0f }, { 0.0f, 0.0f, 0.0f }, { 0.25f, 0.25f, 0.75f }, Material::Diffuse }, // Right
{ 1e5f, { 0.0f, 40.0f, -1e5f }, { 0.0f, 0.0f, 0.0f }, { 0.75f, 0.75f, 0.75f }, Material::Diffuse }, // Back
{ 1e5f, { 0.0f, 40.0f, 1e5f + 600.0f }, { 0.0f, 0.0f, 0.0f }, { 1.00f, 1.00f, 1.00f }, Material::Diffuse }, // Front
{ 1e5f, { 0.0f, -1e5f, 80.0f }, { 0.0f, 0.0f, 0.0f }, { 0.75f, 0.75f, 0.75f }, Material::Diffuse }, // Bottom
{ 1e5f, { 0.0f, 1e5f + 80.0f, 80.0f }, { 0.0f, 0.0f, 0.0f }, { 0.75f, 0.75f, 0.75f }, Material::Diffuse }, // Top
{ 16.0f, { -25.0f, 16.0f, 47.0f }, { 0.0f, 0.0f, 0.0f }, { 1.0f, 1.0f, 1.0f }, Material::Diffuse }, // small sphere 1
{ 20.0f, { 25.0f, 20.0f, 78.0f }, { 0.0f, 0.0f, 0.0f }, { 1.0f, 1.0f, 1.0f }, Material::Diffuse }, // small sphere 2
{ 600.0f, { 0.0f, 678.8f, 80.0f }, { 1.6f, 1.6f, 1.6f }, { 0.0f, 0.0f, 0.0f }, Material::Diffuse } // Light
};
__device__ inline bool intersectScene(const Ray &ray, float &t, int &id)
{
t = FLT_MAX, id = -1;
int sphereNum = sizeof(spheres) / sizeof(Sphere);
for (int i = 0; i < sphereNum; i++)
{
float ct = spheres[i].intersect(ray);
if (ct != 0 && ct < t)
{
t = ct;
id = i;
}
}
return id != -1;
}
__device__ static float rand(uint *seed0, uint *seed1)
{
*seed0 = 36969 * ((*seed0) & 65535) + ((*seed0) >> 16);
*seed1 = 18000 * ((*seed1) & 65535) + ((*seed1) >> 16);
uint ires = ((*seed0) << 16) + (*seed1);
union
{
float f;
uint ui;
} res;
res.ui = (ires & 0x007fffff) | 0x40000000; // bitwise AND, bitwise OR
return (res.f - 2.f) / 2.f;
}
inline int gammaCorrect(float c)
{
return int(pow(clamp(c, 0.0f, 1.0f), 1 / 2.2) * 255 + .5);
}
__device__ float3 pathTrace(Ray &ray, uint *s1, uint *s2)
{
float3 accumColor = make_float3(0.0f, 0.0f, 0.0f);
float3 mask = make_float3(1.0f, 1.0f, 1.0f);
for (int i = 0; i < BOUNCE; i++)
{
float t;
int id;
if (!intersectScene(ray, t, id))
return make_float3(0.0f, 0.0f, 0.0f);
const Sphere &obj = spheres[id];
float3 p = ray.pos + ray.dir * t;
float3 n = normalize(p - obj.pos);
float3 nl = dot(n, ray.dir) < 0 ? n : n * -1;
accumColor += mask * obj.emissionColor;
float r1 = rand(s1, s2) * M_PI * 2;
float r2 = rand(s1, s2);
float r2s = sqrtf(r2);
float3 w = nl;
float3 u = normalize(cross((std::fabs(w.x) > std::fabs(w.y) ? make_float3(0, 1, 0) : make_float3(1, 0, 0)), w));
float3 v = cross(w, u);
float3 d = normalize(u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrtf(1 - r2));
ray.pos = p + nl * RAY_EPSILON;
ray.dir = d;
mask *= obj.mainColor * dot(d, nl) * 2;
}
return accumColor;
}
__global__ void pathTracekernel(float3 *h_output)
{
uint x = blockIdx.x * blockDim.x + threadIdx.x;
uint y = blockIdx.y * blockDim.y + threadIdx.y;
uint i = (HEIGHT - y - 1)*WIDTH + x;
uint s1 = x;
uint s2 = y;
Ray camRay(make_float3(0.0f, 52.0f, 300.0f), normalize(make_float3(0.0f, -0.05f, -1.0f)));
float3 cx = make_float3(WIDTH * 0.5135 / HEIGHT, 0.0f, 0.0f);
float3 cy = normalize(cross(cx, camRay.dir)) * 0.5135;
float3 pixel = make_float3(0.0f);
for (int s = 0; s < SPP; s++)
{
float3 d = camRay.dir + cx*((0.25 + x) / WIDTH - 0.5) + cy*((0.25 + y) / HEIGHT - 0.5);
Ray ray(camRay.pos + d * 40, normalize(d));
pixel += pathTrace(ray, &s1, &s2)*(1. / SPP);
}
h_output[i] = clamp(pixel, 0.0f, 1.0f);
}
int main() {
float3 *h_output = new float3[WIDTH * HEIGHT];
float3 *d_output;
cudaMalloc(&d_output, WIDTH * HEIGHT * sizeof(float3));
dim3 block(8, 8, 1);
dim3 grid(WIDTH / block.x, HEIGHT / block.y, 1);
printf("CUDA initialized.\nStart rendering...\n");
std::chrono::time_point<std::chrono::system_clock> begin = std::chrono::system_clock::now();
pathTracekernel << <grid, block >> > (d_output);
cudaMemcpy(h_output, d_output, WIDTH * HEIGHT * sizeof(float3), cudaMemcpyDeviceToHost);
std::chrono::time_point<std::chrono::system_clock> end = std::chrono::system_clock::now();
std::chrono::duration<double> elapsedTime = end - begin;
printf("Time: %.6lfs\n", elapsedTime.count());
cudaFree(d_output);
printf("Done!\n");
FILE *f = fopen("smallptcuda.ppm", "w");
fprintf(f, "P3\n%d %d\n%d\n", WIDTH, HEIGHT, 255);
for (int i = 0; i < WIDTH * HEIGHT; i++)
fprintf(f, "%d %d %d ", gammaCorrect(h_output[i].x),
gammaCorrect(h_output[i].y),
gammaCorrect(h_output[i].z));
printf("Saved image to 'smallptcuda.ppm'.\n");
delete[] h_output;
return 0;
}
Note
smallPT.cu中包含了一个helper_math.h头文件,该文件实现了float3等cuda基本数据类型的常见操作,例如+ - * dot cross等运算。该头文件位于C:\ProgramData\NVIDIA Corporation\CUDA Samples\v8.0\common\inc目录下。
由于在这段代码里,核函数的单次运行时间会超出系统的默认锁定时间,所以会导致系统认为程序出错而结束进程。详见微软的官网说明。所以我们需要在注册表中修改系统对核函数的监控项。在HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\GraphicsDrivers下添加如下Tdr项:
TdrDelay Specifies the number of seconds that the GPU can delay the preempt request from the GPU scheduler. This is effectively the timeout threshold. The default value is 2 seconds.
TdrDelay可以被理解为核函数单次运行的最长时间。
说明
目前我不会详细解释代码的原理,只是想让大家直观的认识一下path tracing。具体的原理说明会放在之后的教程里。
Profile
参数
图片尺寸(Width*Height):800*600
反射次数(Bounce):4
像素采样数(Spp, Samples per pixel):1024
smallPT.cpp
渲染时间
without OpenMP:595.3s
with OpenMP:107.3s
结果图片
smallPT.cu
渲染时间:6.3s
结果图片
反射次数(Bounce):16
像素采样数(Spp, Samples per pixel):2048
渲染时间:26.9s
结果图片
由于反射次数增加,indirect radiance会增加,场景的亮度也随之增加;像素采样率的提高会使得渲染图片更加平滑。
分析
可以看出,cpu(smallPT.cpp)和gpu(smallPT.cu)的结果图片基本一致(亮度有些差别,可能和float和double的精度有关),但是时间差距很大。在gpu上运行的速度要比在cpu上快107.3/6.3=17倍,比不使用OpenMP多线程加速更要快上595.3/6.3=94倍。所以说,把path tracing这种计算密集型的算法搬到gpu上跑是十分有必要的。
let's step to path tracing now!