Commit cc22ab83 authored by Radek Ošlejšek
Browse files

Merge branch '315-poisson-polish' into 'master'

GPU Poisson disk sub-sampling (finalization)

Closes #315 and #312

See merge request grp-fidentis/analyst2!341
parents 24d0e424 6ea4b83e
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -7,7 +7,7 @@ import com.jogamp.opencl.CLContext;
import java.nio.Buffer;

/**
 * Buffer to be filled by GPU
 * Buffer in GPU memory. Read-Only on host side.
 *
 * @param <T> Any type to be stored in buffer
 * @author Marek Horský
+1 −1
Original line number Diff line number Diff line
@@ -6,7 +6,7 @@ import java.nio.FloatBuffer;
import java.util.List;

/**
 * Resizable buffer to write data to GPU
 * Resizable buffer of GPU memory. Can be populated on host-side.
 *
 * @param <T> Any type to be stored in buffer
 * @author Marek Horský
+7 −16
Original line number Diff line number Diff line
@@ -26,13 +26,10 @@ public interface CommonKernelServices extends CLResources {
     * @param vertexBuffer Buffer of vertices. It is not modified during the process
     * @return Buffer of 8 floats containing the bounding box
     */
    static CLBuffer<FloatBuffer> calculateBBox(CLContext clContext, WriteBufferGPU<Point3d> vertexBuffer) {
        CLCommandQueue queue = clContext.getMaxFlopsDevice().createCommandQueue();
    static CLBuffer<FloatBuffer> calculateBBox(CLContext clContext, CLCommandQueue queue, WriteBufferGPU<Point3d> vertexBuffer) {
        CLProgram kernelServicesProgram = OpenCLServices.useProgram(clContext, CLProgramDef.REDUCTION_SERVICES);

        int bboxSize = Math.max(8, vertexBuffer.getCount() / 32); // Determines buffer size requirements
        int bboxSize = Math.max(8, vertexBuffer.getCount() / 16); // Determines buffer size requirements
        CLBuffer<FloatBuffer> octreeBBoxes = clContext.createFloatBuffer(bboxSize, CLMemory.Mem.READ_WRITE);

        int threadCount = getNearestGreaterMultiple(vertexBuffer.getCount(), MAX_GROUP_SIZE);

        CLKernel copyAndMergeBoundingBoxes = kernelServicesProgram.getKernel("copyAndMergeBoundingBoxes")
@@ -66,7 +63,6 @@ public interface CommonKernelServices extends CLResources {

        copyAndMergeBoundingBoxes.release();
        mergeBoundingBoxes.release();
        queue.release();
        return octreeBBox;
    }

@@ -77,12 +73,11 @@ public interface CommonKernelServices extends CLResources {
     * @param count     Size of the buffer
     * @return largest integer
     */
    static int getLargestInteger(CLContext clContext, CLBuffer<IntBuffer> intBuffer, int count) {
        CLCommandQueue queue = clContext.getMaxFlopsDevice().createCommandQueue();
    static int getLargestInteger(CLContext clContext, CLCommandQueue queue, CLBuffer<IntBuffer> intBuffer, int count) {
        CLProgram kernelServicesProgram = OpenCLServices.useProgram(clContext, CLProgramDef.REDUCTION_SERVICES);
        CLKernel getLargestInteger = kernelServicesProgram.getKernel("getLargestInteger");
        int shift = 1;
        int threadCount = count;
        CLKernel getLargestInteger = kernelServicesProgram.getKernel("getLargestInteger");
        do {
            threadCount = getNearestGreaterMultiple(threadCount, WARP_GROUP_SIZE);
            getLargestInteger.rewind();
@@ -97,12 +92,11 @@ public interface CommonKernelServices extends CLResources {
            threadCount /= WARP_GROUP_SIZE;
        }
        while (threadCount > 1);

        queue.putReadBuffer(intBuffer, false)
                .finish();

        getLargestInteger.release();
        queue.release();
        return intBuffer.getBuffer().get();
        return intBuffer.getBuffer().get(0);
    }


@@ -113,10 +107,8 @@ public interface CommonKernelServices extends CLResources {
     * @param size   Size of the buffer
     * @param value  to fill the buffer with
     */
    static void initializeBuffer(CLContext clContext, CLBuffer<IntBuffer> buffer, int size, int value) {
        CLCommandQueue queue = clContext.getMaxFlopsDevice().createCommandQueue();
    static void initializeBuffer(CLContext clContext, CLCommandQueue queue, CLBuffer<IntBuffer> buffer, int size, int value) {
        CLProgram kernelServicesProgram = OpenCLServices.useProgram(clContext, CLProgramDef.REDUCTION_SERVICES);

        int threads = getNearestGreaterMultiple(size, WARP_GROUP_SIZE);
        CLKernel initializeMemory = kernelServicesProgram.getKernel("initializeMemory");
        initializeMemory.putArg(size)
@@ -126,7 +118,6 @@ public interface CommonKernelServices extends CLResources {
        queue.put1DRangeKernel(initializeMemory, 0, threads, WARP_GROUP_SIZE)
                .finish();
        initializeMemory.release();
        queue.release();
    }

    /**
+3 −2
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@ import java.util.Collection;
/**
 * Creates octree in OpenCL memory
 * Octree is static - Does not support modifications after creation
 * The implementation is thread-safe
 *
 * @author Marek Horský
 */
@@ -32,9 +33,9 @@ public interface OctreeOpenCL extends CLResources {
    }

    /**
     * Builds the tree with provided facets. Reuses allocated memory if possible
     * Builds the octree with provided facets. Reuses allocated memory if possible
     *
     * @param facets
     * @param facets Facets to be loaded in octree
     */
    void build(Collection<MeshFacet> facets);

+19 −35
Original line number Diff line number Diff line
@@ -17,12 +17,17 @@ import cz.fidentis.analyst.opencl.services.octree.OctreeOpenCL;
import javax.vecmath.Point3d;
import java.nio.FloatBuffer;
import java.nio.IntBuffer;
import java.time.Instant;
import java.util.Collection;

/**
 * Builds Octree on GPU in Top-Down manner. Mesh Facet triangles are copied to GPU an subdivided into uniform octants
 * Builds Octree on GPU in top-down direction. Mesh Facet triangles are copied to GPU and subdivided into uniform octants
 * until maximum depth constraint is reached.
 * The process involves:
 * - Allocating and initializing memory
 * - Building the tree structure from vertices
 * - Finding non-empty leaves and the largest leaf size
 * - Allocating leaf memory
 * - Populating the structure with triangles
 *
 * @author Marek Horský
 */
@@ -50,14 +55,12 @@ public class OctreeOpenCLImpl implements OctreeOpenCL {
     * @param clContext Desired CL Context
     */
    public OctreeOpenCLImpl(CLContext clContext) {
        long time = Instant.now().toEpochMilli();
        this.clContext = clContext;
        this.queue = clContext.getMaxFlopsDevice().createCommandQueue();
        this.octreeBuilderProgram = OpenCLServices.useProgram(clContext, CLProgramDef.OCTREE_CONSTRUCTION);
        this.facetBuffer = new FacetBuffer(clContext);
        this.maxLeafCount = OCTREE_SIZE;
        prepareBuffers();
        System.out.println("constructor: " + ((Instant.now().toEpochMilli()) - time));
    }

    @Override
@@ -109,24 +112,19 @@ public class OctreeOpenCLImpl implements OctreeOpenCL {
        leaves.release();
    }

    /**
     * Builds octree on GPU. Allocates memory. Creates tree structure from vertices.
     * Finds non-empty leaves and largest leaf. Allocates leaf memory and populates it with triangles
     *
     * @param facets source data
     */
    @Override
    public void build(Collection<MeshFacet> facets) {
        facetBuffer.loadAsynchronously(facets);
        CommonKernelServices.initializeBuffer(clContext, tree.get(), maxLeafCount, -1);
        CommonKernelServices.initializeBuffer(clContext, queue, tree.get(), maxLeafCount, -1);

        vertexThreads = CommonKernelServices.getNearestGreaterMultiple(
                facetBuffer.getVertexBuffer().getCount(),
                CommonKernelServices.WARP_GROUP_SIZE);

        octreeBBox = CommonKernelServices.calculateBBox(clContext, facetBuffer.getVertexBuffer());
        octreeBBox = CommonKernelServices.calculateBBox(clContext, queue, facetBuffer.getVertexBuffer());
        assignVertices(facetBuffer.getVertexBuffer());
        checkTriangles(facetBuffer.getMeshTriangleBuffer());
        largestLeaf = CommonKernelServices.getLargestInteger(clContext, triangleCounters.get(), leafCount);
        largestLeaf = CommonKernelServices.getLargestInteger(clContext, queue, triangleCounters.get(), leafCount);
        assignTriangles(facetBuffer.getMeshTriangleBuffer());
    }

@@ -139,7 +137,7 @@ public class OctreeOpenCLImpl implements OctreeOpenCL {
    }

    /**
     * Generates tree from vertices. Stopping constraint {@code MAX_DEPTH} due to memory constraints and atomic constraints.
     * Generates tree from vertices. Stopping condition {@code MAX_DEPTH} is used due to memory constraints and atomic constraints.
     * Writes tree as indexes to array. Each node is exactly 1 index pointing to its 1st child.
     * The 8 children are stored contiguously (maintains memory locality)
     */
@@ -159,7 +157,7 @@ public class OctreeOpenCLImpl implements OctreeOpenCL {

        maxLeafCount = counter.getBuffer().get(0);
        triangleCounters.resize(maxLeafCount);
        CommonKernelServices.initializeBuffer(clContext, triangleCounters.get(), triangleCounters.getCount(), 0);
        CommonKernelServices.initializeBuffer(clContext, queue, triangleCounters.get(), triangleCounters.getCount(), 0);
        vertexBuild.release();
    }

@@ -171,7 +169,7 @@ public class OctreeOpenCLImpl implements OctreeOpenCL {
     */

    private void checkTriangles(WriteBufferGPU<MeshTriangle> meshTriangleBuffer) {
        counter.getBuffer().put(0, -3); // Leaf counter
        counter.getBuffer().put(0, -3); // Leaf counter ==> Distributes unique leaf indexes
        queue.putWriteBuffer(counter, false);

        triangleThreads = CommonKernelServices.getNearestGreaterMultiple(
@@ -182,8 +180,7 @@ public class OctreeOpenCLImpl implements OctreeOpenCL {
                .putArg(meshTriangleBuffer.getCount())
                .putArgs(octreeBBox, meshTriangleBuffer.get(), tree.get(), triangleCounters.get(), counter);

        queue.finish()
                .put1DRangeKernel(checkTriangles, 0, triangleThreads, CommonKernelServices.WARP_GROUP_SIZE)
        queue.put1DRangeKernel(checkTriangles, 0, triangleThreads, CommonKernelServices.WARP_GROUP_SIZE)
                .putReadBuffer(counter, false)
                .finish();

@@ -195,29 +192,16 @@ public class OctreeOpenCLImpl implements OctreeOpenCL {
     * Traverses the tree structure generated from vertices once again, 1 thread per triangle, this time storing the triangles in their correct leaves.
     */
    private void assignTriangles(WriteBufferGPU<MeshTriangle> meshTriangleBuffer) {
        int bufferSize = leafCount * largestLeaf;
        System.out.println(leafCount + " * " + largestLeaf);
        if (bufferSize < 0) {
            System.err.println("Buffer size overflow");
            return;
        }
        if (bufferSize == 0) {
            System.err.println("Buffer size is 0");
            return;
        }
        leaves.resize(bufferSize);

        CommonKernelServices.initializeBuffer(clContext, leaves.get(), leaves.getCount(), -1);
        CommonKernelServices.initializeBuffer(clContext, triangleCounters.get(), triangleCounters.getCount(), 0);

        leaves.resize(leafCount * largestLeaf);
        CommonKernelServices.initializeBuffer(clContext, queue, leaves.get(), leaves.getCount(), -1);
        CommonKernelServices.initializeBuffer(clContext, queue, triangleCounters.get(), triangleCounters.getCount(), 0);

        CLKernel assignTriangles = octreeBuilderProgram.getKernel("assignTriangles")
                .putArg(meshTriangleBuffer.getCount())
                .putArg(largestLeaf)
                .putArgs(octreeBBox, meshTriangleBuffer.get(), tree.get(), triangleCounters.get(), leaves.get());

        queue.finish()
                .put1DRangeKernel(assignTriangles, 0, triangleThreads, CommonKernelServices.WARP_GROUP_SIZE)
        queue.put1DRangeKernel(assignTriangles, 0, triangleThreads, CommonKernelServices.WARP_GROUP_SIZE)
                .finish();
        assignTriangles.release();
    }
Loading