教你一步步写一个cuda path tracer：牛刀小试！

前面几篇都是关于cuda的基础知识介绍，甚是乏味。是时候用cuda渲染一张图片来直观地感受一下了！

Kevin Beason大神曾经写过一个99行的c++版本的path tracer。麻雀虽小，五脏俱全，这个path tracer包含了很多global illumination的特性：

Monte Carlo samling

OpenMP
Specular, Diffuse, and glass BRDFs
Antialising via super-sampling with importance-sampled tent distribution
Cosine importance sampling of the hemisphere for diffuse reflection
Russian roulette for path termination
Russian roulette and splitting for selecting reflection and/or refraction for glass BRDF

为了方便起见，我去掉了不必要的特性，只保留了diffuse材质。源代码请从官网上获取。修改后的代码如下：

smallPT.cpp

#include <cmath>
#include <cstdio>
#include <chrono>

#define BOUNCE 4
#define WIDTH 800
#define HEIGHT 600
#define SPP 1024

struct Vec
{
    double x, y, z;
    Vec(double x_ = 0, double y_ = 0, double z_ = 0) { x = x_; y = y_; z = z_; }
    Vec operator+(const Vec &b) const { return Vec(x + b.x, y + b.y, z + b.z); }
    Vec operator-(const Vec &b) const { return Vec(x - b.x, y - b.y, z - b.z); }
    Vec operator*(double b) const { return Vec(x*b, y*b, z*b); }
    Vec mult(const Vec &b) const { return Vec(x*b.x, y*b.y, z*b.z); }
    Vec& norm() { return *this = *this * (1 / sqrt(x*x + y*y + z*z)); }
    double dot(const Vec &b) const { return x*b.x + y*b.y + z*b.z; }
    Vec operator%(Vec&b) { return Vec(y*b.z - z*b.y, z*b.x - x*b.z, x*b.y - y*b.x); }
};

struct Ray { Vec o, d; Ray(Vec o_, Vec d_) : o(o_), d(d_) {} };

enum Refl_t { DIFF, SPEC, REFR }; // diffuse, specular, refraction

struct Sphere
{
    double rad;
    Vec p, e, c; // position, emission, color
    Refl_t refl;
    Sphere(double rad_, Vec p_, Vec e_, Vec c_, Refl_t refl_) :
        rad(rad_), p(p_), e(e_), c(c_), refl(refl_) {}
    double intersect(const Ray &r) const
    {
        Vec op = p - r.o;
        double t, eps = 1e-4, b = op.dot(r.d), det = b*b - op.dot(op) + rad*rad;
        if (det < 0) return 0; else det = sqrt(det);
        return (t = b - det) > eps ? t : ((t = b + det) > eps ? t : 0);
    }
};

Sphere spheres[] =
{
    Sphere(1e5, Vec(-1e5f - 50.0f, 40.0f, 80.0f), Vec(),Vec(.75,.25,.25),DIFF), // Left 
    Sphere(1e5, Vec(1e5f + 50.0f, 40.0f, 80.0f),Vec(),Vec(.25,.25,.75),DIFF), // Rght 
    Sphere(1e5, Vec(0.0f, 40.0f, -1e5f),     Vec(),Vec(.75,.75,.75),DIFF), // Back 
    Sphere(1e5, Vec(0.0f, 40.0f, 1e5f + 600.0f), Vec(),Vec(1, 1, 1), DIFF), // Frnt 
    Sphere(1e5, Vec(0.0f, -1e5f, 80.0f ),    Vec(),Vec(.75,.75,.75),DIFF), // Botm 
    Sphere(1e5, Vec(0.0f, 1e5f + 80.0f, 80.0f),Vec(),Vec(.75,.75,.75),DIFF), // Top 
    Sphere(16,Vec(-25.0f, 16.0f, 47.0f),       Vec(),Vec(1,1,1), DIFF), // Mirr 
    Sphere(20,Vec(25.0f, 20.0f, 78.0f),       Vec(),Vec(1,1,1), DIFF), // Glas 
    Sphere(600, Vec(0.0f, 678.8f, 80.0f),Vec(1.6f, 1.6f, 1.6f),  Vec(), DIFF) // Lite 
};

float rand(unsigned int *seed0, unsigned int *seed1)
{
    *seed0 = 36969 * ((*seed0) & 65535) + ((*seed0) >> 16);
    *seed1 = 18000 * ((*seed1) & 65535) + ((*seed1) >> 16);

    unsigned int ires = ((*seed0) << 16) + (*seed1);

    union
    {
        float f;
        unsigned int ui;
    } res;

    res.ui = (ires & 0x007fffff) | 0x40000000;  // bitwise AND, bitwise OR

    return (res.f - 2.f) / 2.f;
}

inline double clamp(double x) { return x < 0 ? 0 : x>1 ? 1 : x; }
inline int toInt(double x) { return int(pow(clamp(x), 1 / 2.2) * 255 + .5); }
inline bool intersect(const Ray &r, double &t, int &id) {
    double n = sizeof(spheres) / sizeof(Sphere), d, inf = t = 1e20;
    for (int i = int(n); i--;) if ((d = spheres[i].intersect(r)) && d < t) { t = d; id = i; }
    return t < inf;
}

Vec radiance(const Ray &r, int depth, unsigned int *s0, unsigned int *s1)
{
    double t;                               // distance to intersection 
    int id = 0;                               // id of intersected object 
    if (depth > BOUNCE || !intersect(r, t, id)) return Vec(); // if miss, return black 
    const Sphere &obj = spheres[id];        // the hit object 
    Vec x = r.o + r.d*t, n = (x - obj.p).norm(), nl = n.dot(r.d) < 0 ? n : n*-1, f = obj.c;

    if (obj.refl == DIFF)
    {
        double r1 = 2 * M_PI*rand(s0, s1), r2 = rand(s0, s1), r2s = sqrt(r2);
        Vec w = nl, u = ((fabs(w.x) > .1 ? Vec(0, 1) : Vec(1)) % w).norm(), v = w%u;
        Vec d = (u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrt(1 - r2)).norm();
        return obj.e + f.mult(radiance(Ray(x, d), depth + 1, s0, s1)) * n.dot(d) * 2;
    }
    return Vec();
}

int main()
{
    Ray cam(Vec(0, 52, 300), Vec(0, -0.05, -1.0).norm()); // cam pos, dir 
    Vec cx = Vec(WIDTH*.5135 / HEIGHT), cy = (cx%cam.d).norm()*.5135, r, *c = new Vec[WIDTH*HEIGHT];

    std::chrono::time_point<std::chrono::system_clock> begin = std::chrono::system_clock::now();
#pragma omp parallel for schedule(dynamic, 1) private(r)       // OpenMP 
    for (int y = 0; y < HEIGHT; y++) // Loop over image rows 
    {
        fprintf(stderr, "\rRendering (%d spp) %5.2f%%", SPP, 100.*y / (HEIGHT - 1));
        for (int x = 0; x < WIDTH; x++)   // Loop cols
        {
            unsigned int s0 = x;
            unsigned int s1 = y;
            int i = (HEIGHT - y - 1)*WIDTH + x;
            r = Vec();
            for (int s = 0; s < SPP; s++)
            {
                Vec d = cx*((0.25 + x) / WIDTH - .5) + cy*((0.25 + y) / HEIGHT - .5) + cam.d;
                r = r + radiance(Ray(cam.o + d * 40, d.norm()), 0, &s0, &s1);
            }
            r = r * (1.0 / SPP);
            c[i] = Vec(clamp(r.x), clamp(r.y), clamp(r.z));
        }
    }
    std::chrono::time_point<std::chrono::system_clock> end = std::chrono::system_clock::now();
    std::chrono::duration<double> elapsedTime = end - begin;
    printf("\nTime: %.6lfs\n", elapsedTime.count());

    FILE *f = fopen("image.ppm", "w");         // Write image to PPM file. 
    fprintf(f, "P3\n%d %d\n%d\n", WIDTH, HEIGHT, 255);
    for (int i = 0; i < WIDTH*HEIGHT; i++)
        fprintf(f, "%d %d %d ", toInt(c[i].x), toInt(c[i].y), toInt(c[i].z));
}

Note

1.这段代码使用OpenMP进行多线程加速。

OpenMP（Open Multi-Processing）是一套支持跨平台共享内存方式的多线程并发的编程API

在vs2015中需要手动启用OpenMP功能：

vs2015中打开OpenMP选项

2.生成的ppm图片文件可以用ps等软件打开

对应的cuda代码如下：

smallPT.cu

// cuda headers
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

// c++ headers
#include <cstdio>
#include <algorithm>
#include <climits>
#include <chrono>

#include "helper_math.h"

#define WIDTH 800
#define HEIGHT 600
#define SPP 1024
#define BOUNCE 4
#define SPHERE_EPSILON 0.0001f
#define BOX_EPSILON 0.001f
#define RAY_EPSILON 0.05f;

struct Ray 
{
    __device__ Ray(float3 pos, float3 dir) :
        pos(pos), dir(dir) {}

    float3 pos;
    float3 dir;
};

enum class Material { Diffuse, Specular, Refractive };

struct Sphere 
{
    __device__ float intersect(const Ray &ray) const
    {
        float t;
        float3 dis = pos - ray.pos;
        float proj = dot(dis, ray.dir);
        float delta = proj * proj - dot(dis, dis) + radius * radius;

        if (delta < 0) return 0;

        delta = sqrtf(delta);
        return (t = proj - delta) > SPHERE_EPSILON ? t : ((t = proj + delta) > SPHERE_EPSILON ? t : 0);
    }

    float radius;
    float3 pos, emissionColor, mainColor;
    Material material;
};

__constant__ Sphere spheres[] =
{
    { 1e5f, { -1e5f - 50.0f, 40.0f, 80.0f }, { 0.0f, 0.0f, 0.0f }, { 0.75f, 0.25f, 0.25f }, Material::Diffuse }, // Left 
    { 1e5f, { 1e5f + 50.0f, 40.0f, 80.0f }, { 0.0f, 0.0f, 0.0f }, { 0.25f, 0.25f, 0.75f }, Material::Diffuse }, // Right 
    { 1e5f, { 0.0f, 40.0f, -1e5f }, { 0.0f, 0.0f, 0.0f }, { 0.75f, 0.75f, 0.75f }, Material::Diffuse }, // Back 
    { 1e5f, { 0.0f, 40.0f, 1e5f + 600.0f }, { 0.0f, 0.0f, 0.0f }, { 1.00f, 1.00f, 1.00f }, Material::Diffuse }, // Front 
    { 1e5f, { 0.0f, -1e5f, 80.0f }, { 0.0f, 0.0f, 0.0f }, { 0.75f, 0.75f, 0.75f }, Material::Diffuse }, // Bottom 
    { 1e5f, { 0.0f, 1e5f + 80.0f, 80.0f }, { 0.0f, 0.0f, 0.0f }, { 0.75f, 0.75f, 0.75f }, Material::Diffuse }, // Top 
    { 16.0f, { -25.0f, 16.0f, 47.0f }, { 0.0f, 0.0f, 0.0f }, { 1.0f, 1.0f, 1.0f }, Material::Diffuse }, // small sphere 1
    { 20.0f, { 25.0f, 20.0f, 78.0f }, { 0.0f, 0.0f, 0.0f }, { 1.0f, 1.0f, 1.0f }, Material::Diffuse }, // small sphere 2
    { 600.0f, { 0.0f, 678.8f, 80.0f }, { 1.6f, 1.6f, 1.6f }, { 0.0f, 0.0f, 0.0f }, Material::Diffuse }  // Light
};

__device__ inline bool intersectScene(const Ray &ray, float &t, int &id) 
{
    t = FLT_MAX, id = -1;
    int sphereNum = sizeof(spheres) / sizeof(Sphere);
    for (int i = 0; i < sphereNum; i++)
    {
        float ct = spheres[i].intersect(ray);
        if (ct != 0 && ct < t)
        {  
            t = ct;
            id = i;
        }
    }
        
    return id != -1;
}

__device__ static float rand(uint *seed0, uint *seed1) 
{
    *seed0 = 36969 * ((*seed0) & 65535) + ((*seed0) >> 16);
    *seed1 = 18000 * ((*seed1) & 65535) + ((*seed1) >> 16);

    uint ires = ((*seed0) << 16) + (*seed1);

    union 
    {
        float f;
        uint ui;
    } res;

    res.ui = (ires & 0x007fffff) | 0x40000000;  // bitwise AND, bitwise OR

    return (res.f - 2.f) / 2.f;
}

inline int gammaCorrect(float c)
{
    return int(pow(clamp(c, 0.0f, 1.0f), 1 / 2.2) * 255 + .5);
}

__device__ float3 pathTrace(Ray &ray, uint *s1, uint *s2) 
{ 
    float3 accumColor = make_float3(0.0f, 0.0f, 0.0f);
    float3 mask = make_float3(1.0f, 1.0f, 1.0f);

    for (int i = 0; i < BOUNCE; i++)
    {
        float t;
        int id;

        if (!intersectScene(ray, t, id))
            return make_float3(0.0f, 0.0f, 0.0f);

        const Sphere &obj = spheres[id];
        float3 p = ray.pos + ray.dir * t; 
        float3 n = normalize(p - obj.pos);
        float3 nl = dot(n, ray.dir) < 0 ? n : n * -1;

        accumColor += mask * obj.emissionColor;

        float r1 = rand(s1, s2) * M_PI * 2;
        float r2 = rand(s1, s2);
        float r2s = sqrtf(r2);

        float3 w = nl;
        float3 u = normalize(cross((std::fabs(w.x) > std::fabs(w.y) ? make_float3(0, 1, 0) : make_float3(1, 0, 0)), w));
        float3 v = cross(w, u);

        float3 d = normalize(u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrtf(1 - r2));

        ray.pos = p + nl * RAY_EPSILON;
        ray.dir = d;

        mask *= obj.mainColor * dot(d, nl) * 2;
    }

    return accumColor;
}

__global__ void pathTracekernel(float3 *h_output) 
{
    uint x = blockIdx.x * blockDim.x + threadIdx.x;
    uint y = blockIdx.y * blockDim.y + threadIdx.y;

    uint i = (HEIGHT - y - 1)*WIDTH + x;

    uint s1 = x;
    uint s2 = y;

    Ray camRay(make_float3(0.0f, 52.0f, 300.0f), normalize(make_float3(0.0f, -0.05f, -1.0f)));
    float3 cx = make_float3(WIDTH * 0.5135 / HEIGHT, 0.0f, 0.0f);
    float3 cy = normalize(cross(cx, camRay.dir)) * 0.5135;
    float3 pixel = make_float3(0.0f);

    for (int s = 0; s < SPP; s++) 
    {  
        float3 d = camRay.dir + cx*((0.25 + x) / WIDTH - 0.5) + cy*((0.25 + y) / HEIGHT - 0.5);
        Ray ray(camRay.pos + d * 40, normalize(d));

        pixel += pathTrace(ray, &s1, &s2)*(1. / SPP);
    }

    h_output[i] = clamp(pixel, 0.0f, 1.0f);
}

int main() {

    float3 *h_output = new float3[WIDTH * HEIGHT];
    float3 *d_output;

    cudaMalloc(&d_output, WIDTH * HEIGHT * sizeof(float3));

    dim3 block(8, 8, 1);
    dim3 grid(WIDTH / block.x, HEIGHT / block.y, 1);

    printf("CUDA initialized.\nStart rendering...\n");

    std::chrono::time_point<std::chrono::system_clock> begin = std::chrono::system_clock::now();

    pathTracekernel << <grid, block >> > (d_output);

    cudaMemcpy(h_output, d_output, WIDTH * HEIGHT * sizeof(float3), cudaMemcpyDeviceToHost);

    std::chrono::time_point<std::chrono::system_clock> end = std::chrono::system_clock::now();
    std::chrono::duration<double> elapsedTime = end - begin;
    printf("Time: %.6lfs\n", elapsedTime.count());

    cudaFree(d_output);

    printf("Done!\n");

    FILE *f = fopen("smallptcuda.ppm", "w");
    fprintf(f, "P3\n%d %d\n%d\n", WIDTH, HEIGHT, 255);
    for (int i = 0; i < WIDTH * HEIGHT; i++)
        fprintf(f, "%d %d %d ", gammaCorrect(h_output[i].x),
            gammaCorrect(h_output[i].y),
            gammaCorrect(h_output[i].z));

    printf("Saved image to 'smallptcuda.ppm'.\n");

    delete[] h_output;

    return 0;
}

Note

smallPT.cu中包含了一个helper_math.h头文件，该文件实现了float3等cuda基本数据类型的常见操作，例如+ - * dot cross等运算。该头文件位于C:\ProgramData\NVIDIA Corporation\CUDA Samples\v8.0\common\inc目录下。
由于在这段代码里，核函数的单次运行时间会超出系统的默认锁定时间，所以会导致系统认为程序出错而结束进程。详见微软的官网说明。所以我们需要在注册表中修改系统对核函数的监控项。在HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\GraphicsDrivers下添加如下Tdr项：

TdrDelay

TdrDelay Specifies the number of seconds that the GPU can delay the preempt request from the GPU scheduler. This is effectively the timeout threshold. The default value is 2 seconds.

TdrDelay可以被理解为核函数单次运行的最长时间。

说明

目前我不会详细解释代码的原理，只是想让大家直观的认识一下path tracing。具体的原理说明会放在之后的教程里。

Profile

参数

图片尺寸（Width*Height）：800*600
反射次数（Bounce）：4
像素采样数（Spp, Samples per pixel）：1024

smallPT.cpp

渲染时间
without OpenMP：595.3s

不用OpenMP加速

with OpenMP：107.3s

OpenMP加速

结果图片

cpu渲染图

smallPT.cu

渲染时间：6.3s

结果图片

gpu渲染图

反射次数（Bounce）：16
像素采样数（Spp, Samples per pixel）：2048

渲染时间：26.9s

结果图片

spp:2048, bounce:8

由于反射次数增加，indirect radiance会增加，场景的亮度也随之增加；像素采样率的提高会使得渲染图片更加平滑。

分析

可以看出，cpu（smallPT.cpp）和gpu（smallPT.cu）的结果图片基本一致（亮度有些差别，可能和float和double的精度有关），但是时间差距很大。在gpu上运行的速度要比在cpu上快107.3/6.3=17倍，比不使用OpenMP多线程加速更要快上595.3/6.3=94倍。所以说，把path tracing这种计算密集型的算法搬到gpu上跑是十分有必要的。

let's step to path tracing now!