Loading GPU/src/main/java/cz/fidentis/analyst/opencl/memory/BufferGPU.java +1 −1 Original line number Diff line number Diff line Loading @@ -7,7 +7,7 @@ import com.jogamp.opencl.CLContext; import java.nio.Buffer; /** * Buffer to be filled by GPU * Buffer in GPU memory. Read-Only on host side. * * @param <T> Any type to be stored in buffer * @author Marek Horský Loading GPU/src/main/java/cz/fidentis/analyst/opencl/memory/WriteBufferGPU.java +1 −1 Original line number Diff line number Diff line Loading @@ -6,7 +6,7 @@ import java.nio.FloatBuffer; import java.util.List; /** * Resizable buffer to write data to GPU * Resizable buffer of GPU memory. Can be populated on host-side. * * @param <T> Any type to be stored in buffer * @author Marek Horský Loading GPU/src/main/java/cz/fidentis/analyst/opencl/services/common/CommonKernelServices.java +7 −16 Original line number Diff line number Diff line Loading @@ -26,13 +26,10 @@ public interface CommonKernelServices extends CLResources { * @param vertexBuffer Buffer of vertices. It is not modified during the process * @return Buffer of 8 floats containing the bounding box */ static CLBuffer<FloatBuffer> calculateBBox(CLContext clContext, WriteBufferGPU<Point3d> vertexBuffer) { CLCommandQueue queue = clContext.getMaxFlopsDevice().createCommandQueue(); static CLBuffer<FloatBuffer> calculateBBox(CLContext clContext, CLCommandQueue queue, WriteBufferGPU<Point3d> vertexBuffer) { CLProgram kernelServicesProgram = OpenCLServices.useProgram(clContext, CLProgramDef.REDUCTION_SERVICES); int bboxSize = Math.max(8, vertexBuffer.getCount() / 32); // Determines buffer size requirements int bboxSize = Math.max(8, vertexBuffer.getCount() / 16); // Determines buffer size requirements CLBuffer<FloatBuffer> octreeBBoxes = clContext.createFloatBuffer(bboxSize, CLMemory.Mem.READ_WRITE); int threadCount = getNearestGreaterMultiple(vertexBuffer.getCount(), MAX_GROUP_SIZE); CLKernel copyAndMergeBoundingBoxes = kernelServicesProgram.getKernel("copyAndMergeBoundingBoxes") Loading Loading @@ -66,7 +63,6 @@ public interface CommonKernelServices extends CLResources { copyAndMergeBoundingBoxes.release(); mergeBoundingBoxes.release(); queue.release(); return octreeBBox; } Loading @@ -77,12 +73,11 @@ public interface CommonKernelServices extends CLResources { * @param count Size of the buffer * @return largest integer */ static int getLargestInteger(CLContext clContext, CLBuffer<IntBuffer> intBuffer, int count) { CLCommandQueue queue = clContext.getMaxFlopsDevice().createCommandQueue(); static int getLargestInteger(CLContext clContext, CLCommandQueue queue, CLBuffer<IntBuffer> intBuffer, int count) { CLProgram kernelServicesProgram = OpenCLServices.useProgram(clContext, CLProgramDef.REDUCTION_SERVICES); CLKernel getLargestInteger = kernelServicesProgram.getKernel("getLargestInteger"); int shift = 1; int threadCount = count; CLKernel getLargestInteger = kernelServicesProgram.getKernel("getLargestInteger"); do { threadCount = getNearestGreaterMultiple(threadCount, WARP_GROUP_SIZE); getLargestInteger.rewind(); Loading @@ -97,12 +92,11 @@ public interface CommonKernelServices extends CLResources { threadCount /= WARP_GROUP_SIZE; } while (threadCount > 1); queue.putReadBuffer(intBuffer, false) .finish(); getLargestInteger.release(); queue.release(); return intBuffer.getBuffer().get(); return intBuffer.getBuffer().get(0); } Loading @@ -113,10 +107,8 @@ public interface CommonKernelServices extends CLResources { * @param size Size of the buffer * @param value to fill the buffer with */ static void initializeBuffer(CLContext clContext, CLBuffer<IntBuffer> buffer, int size, int value) { CLCommandQueue queue = clContext.getMaxFlopsDevice().createCommandQueue(); static void initializeBuffer(CLContext clContext, CLCommandQueue queue, CLBuffer<IntBuffer> buffer, int size, int value) { CLProgram kernelServicesProgram = OpenCLServices.useProgram(clContext, CLProgramDef.REDUCTION_SERVICES); int threads = getNearestGreaterMultiple(size, WARP_GROUP_SIZE); CLKernel initializeMemory = kernelServicesProgram.getKernel("initializeMemory"); initializeMemory.putArg(size) Loading @@ -126,7 +118,6 @@ public interface CommonKernelServices extends CLResources { queue.put1DRangeKernel(initializeMemory, 0, threads, WARP_GROUP_SIZE) .finish(); initializeMemory.release(); queue.release(); } /** Loading GPU/src/main/java/cz/fidentis/analyst/opencl/services/octree/OctreeOpenCL.java +3 −2 Original line number Diff line number Diff line Loading @@ -17,6 +17,7 @@ import java.util.Collection; /** * Creates octree in OpenCL memory * Octree is static - Does not support modifications after creation * The implementation is thread-safe * * @author Marek Horský */ Loading @@ -32,9 +33,9 @@ public interface OctreeOpenCL extends CLResources { } /** * Builds the tree with provided facets. Reuses allocated memory if possible * Builds the octree with provided facets. Reuses allocated memory if possible * * @param facets * @param facets Facets to be loaded in octree */ void build(Collection<MeshFacet> facets); Loading GPU/src/main/java/cz/fidentis/analyst/opencl/services/octree/impl/OctreeOpenCLImpl.java +19 −35 Original line number Diff line number Diff line Loading @@ -17,12 +17,17 @@ import cz.fidentis.analyst.opencl.services.octree.OctreeOpenCL; import javax.vecmath.Point3d; import java.nio.FloatBuffer; import java.nio.IntBuffer; import java.time.Instant; import java.util.Collection; /** * Builds Octree on GPU in Top-Down manner. Mesh Facet triangles are copied to GPU an subdivided into uniform octants * Builds Octree on GPU in top-down direction. Mesh Facet triangles are copied to GPU and subdivided into uniform octants * until maximum depth constraint is reached. * The process involves: * Memory allocation / initialization * Builds tree structure from vertices. * Finds non-empty leaves and largest leaf size * Allocates leaf memory * Populates structure with triangles * * @author Marek Horský */ Loading Loading @@ -50,14 +55,12 @@ public class OctreeOpenCLImpl implements OctreeOpenCL { * @param clContext Desired CL Context */ public OctreeOpenCLImpl(CLContext clContext) { long time = Instant.now().toEpochMilli(); this.clContext = clContext; this.queue = clContext.getMaxFlopsDevice().createCommandQueue(); this.octreeBuilderProgram = OpenCLServices.useProgram(clContext, CLProgramDef.OCTREE_CONSTRUCTION); this.facetBuffer = new FacetBuffer(clContext); this.maxLeafCount = OCTREE_SIZE; prepareBuffers(); System.out.println("constructor: " + ((Instant.now().toEpochMilli()) - time)); } @Override Loading Loading @@ -109,24 +112,19 @@ public class OctreeOpenCLImpl implements OctreeOpenCL { leaves.release(); } /** * Builds octree on GPU. Allocates memory. Creates tree structure from vertices. * Finds non-empty leaves and largest leaf. Allocates leaf memory and populates it with triangles * * @param facets source data */ @Override public void build(Collection<MeshFacet> facets) { facetBuffer.loadAsynchronously(facets); CommonKernelServices.initializeBuffer(clContext, tree.get(), maxLeafCount, -1); CommonKernelServices.initializeBuffer(clContext, queue, tree.get(), maxLeafCount, -1); vertexThreads = CommonKernelServices.getNearestGreaterMultiple( facetBuffer.getVertexBuffer().getCount(), CommonKernelServices.WARP_GROUP_SIZE); octreeBBox = CommonKernelServices.calculateBBox(clContext, facetBuffer.getVertexBuffer()); octreeBBox = CommonKernelServices.calculateBBox(clContext, queue, facetBuffer.getVertexBuffer()); assignVertices(facetBuffer.getVertexBuffer()); checkTriangles(facetBuffer.getMeshTriangleBuffer()); largestLeaf = CommonKernelServices.getLargestInteger(clContext, triangleCounters.get(), leafCount); largestLeaf = CommonKernelServices.getLargestInteger(clContext, queue, triangleCounters.get(), leafCount); assignTriangles(facetBuffer.getMeshTriangleBuffer()); } Loading @@ -139,7 +137,7 @@ public class OctreeOpenCLImpl implements OctreeOpenCL { } /** * Generates tree from vertices. Stopping constraint {@code MAX_DEPTH} due to memory constraints and atomic constraints. * Generates tree from vertices. Stopping condition {@code MAX_DEPTH} is used due to memory constraints and atomic constraints. * Writes tree as indexes to array. Each node is exactly 1 index pointing to his 1st child. * The 8 children are stored continuously (Maintains memory locality) */ Loading @@ -159,7 +157,7 @@ public class OctreeOpenCLImpl implements OctreeOpenCL { maxLeafCount = counter.getBuffer().get(0); triangleCounters.resize(maxLeafCount); CommonKernelServices.initializeBuffer(clContext, triangleCounters.get(), triangleCounters.getCount(), 0); CommonKernelServices.initializeBuffer(clContext, queue, triangleCounters.get(), triangleCounters.getCount(), 0); vertexBuild.release(); } Loading @@ -171,7 +169,7 @@ public class OctreeOpenCLImpl implements OctreeOpenCL { */ private void checkTriangles(WriteBufferGPU<MeshTriangle> meshTriangleBuffer) { counter.getBuffer().put(0, -3); // Leaf counter counter.getBuffer().put(0, -3); // Leaf counter ==> Distributes unique leaf indexes queue.putWriteBuffer(counter, false); triangleThreads = CommonKernelServices.getNearestGreaterMultiple( Loading @@ -182,8 +180,7 @@ public class OctreeOpenCLImpl implements OctreeOpenCL { .putArg(meshTriangleBuffer.getCount()) .putArgs(octreeBBox, meshTriangleBuffer.get(), tree.get(), triangleCounters.get(), counter); queue.finish() .put1DRangeKernel(checkTriangles, 0, triangleThreads, CommonKernelServices.WARP_GROUP_SIZE) queue.put1DRangeKernel(checkTriangles, 0, triangleThreads, CommonKernelServices.WARP_GROUP_SIZE) .putReadBuffer(counter, false) .finish(); Loading @@ -195,29 +192,16 @@ public class OctreeOpenCLImpl implements OctreeOpenCL { * Traverses tree structure generated from vertices 1 thread per triangle once again, this time stores the triangles correct leaves. */ private void assignTriangles(WriteBufferGPU<MeshTriangle> meshTriangleBuffer) { int bufferSize = leafCount * largestLeaf; System.out.println(leafCount + " * " + largestLeaf); if (bufferSize < 0) { System.err.println("Buffer size overflow"); return; } if (bufferSize == 0) { System.err.println("Buffer size is 0"); return; } leaves.resize(bufferSize); CommonKernelServices.initializeBuffer(clContext, leaves.get(), leaves.getCount(), -1); CommonKernelServices.initializeBuffer(clContext, triangleCounters.get(), triangleCounters.getCount(), 0); leaves.resize(leafCount * largestLeaf); CommonKernelServices.initializeBuffer(clContext, queue, leaves.get(), leaves.getCount(), -1); CommonKernelServices.initializeBuffer(clContext, queue, triangleCounters.get(), triangleCounters.getCount(), 0); CLKernel assignTriangles = octreeBuilderProgram.getKernel("assignTriangles") .putArg(meshTriangleBuffer.getCount()) .putArg(largestLeaf) .putArgs(octreeBBox, meshTriangleBuffer.get(), tree.get(), triangleCounters.get(), leaves.get()); queue.finish() .put1DRangeKernel(assignTriangles, 0, triangleThreads, CommonKernelServices.WARP_GROUP_SIZE) queue.put1DRangeKernel(assignTriangles, 0, triangleThreads, CommonKernelServices.WARP_GROUP_SIZE) .finish(); assignTriangles.release(); } Loading Loading
GPU/src/main/java/cz/fidentis/analyst/opencl/memory/BufferGPU.java +1 −1 Original line number Diff line number Diff line Loading @@ -7,7 +7,7 @@ import com.jogamp.opencl.CLContext; import java.nio.Buffer; /** * Buffer to be filled by GPU * Buffer in GPU memory. Read-Only on host side. * * @param <T> Any type to be stored in buffer * @author Marek Horský Loading
GPU/src/main/java/cz/fidentis/analyst/opencl/memory/WriteBufferGPU.java +1 −1 Original line number Diff line number Diff line Loading @@ -6,7 +6,7 @@ import java.nio.FloatBuffer; import java.util.List; /** * Resizable buffer to write data to GPU * Resizable buffer of GPU memory. Can be populated on host-side. * * @param <T> Any type to be stored in buffer * @author Marek Horský Loading
GPU/src/main/java/cz/fidentis/analyst/opencl/services/common/CommonKernelServices.java +7 −16 Original line number Diff line number Diff line Loading @@ -26,13 +26,10 @@ public interface CommonKernelServices extends CLResources { * @param vertexBuffer Buffer of vertices. It is not modified during the process * @return Buffer of 8 floats containing the bounding box */ static CLBuffer<FloatBuffer> calculateBBox(CLContext clContext, WriteBufferGPU<Point3d> vertexBuffer) { CLCommandQueue queue = clContext.getMaxFlopsDevice().createCommandQueue(); static CLBuffer<FloatBuffer> calculateBBox(CLContext clContext, CLCommandQueue queue, WriteBufferGPU<Point3d> vertexBuffer) { CLProgram kernelServicesProgram = OpenCLServices.useProgram(clContext, CLProgramDef.REDUCTION_SERVICES); int bboxSize = Math.max(8, vertexBuffer.getCount() / 32); // Determines buffer size requirements int bboxSize = Math.max(8, vertexBuffer.getCount() / 16); // Determines buffer size requirements CLBuffer<FloatBuffer> octreeBBoxes = clContext.createFloatBuffer(bboxSize, CLMemory.Mem.READ_WRITE); int threadCount = getNearestGreaterMultiple(vertexBuffer.getCount(), MAX_GROUP_SIZE); CLKernel copyAndMergeBoundingBoxes = kernelServicesProgram.getKernel("copyAndMergeBoundingBoxes") Loading Loading @@ -66,7 +63,6 @@ public interface CommonKernelServices extends CLResources { copyAndMergeBoundingBoxes.release(); mergeBoundingBoxes.release(); queue.release(); return octreeBBox; } Loading @@ -77,12 +73,11 @@ public interface CommonKernelServices extends CLResources { * @param count Size of the buffer * @return largest integer */ static int getLargestInteger(CLContext clContext, CLBuffer<IntBuffer> intBuffer, int count) { CLCommandQueue queue = clContext.getMaxFlopsDevice().createCommandQueue(); static int getLargestInteger(CLContext clContext, CLCommandQueue queue, CLBuffer<IntBuffer> intBuffer, int count) { CLProgram kernelServicesProgram = OpenCLServices.useProgram(clContext, CLProgramDef.REDUCTION_SERVICES); CLKernel getLargestInteger = kernelServicesProgram.getKernel("getLargestInteger"); int shift = 1; int threadCount = count; CLKernel getLargestInteger = kernelServicesProgram.getKernel("getLargestInteger"); do { threadCount = getNearestGreaterMultiple(threadCount, WARP_GROUP_SIZE); getLargestInteger.rewind(); Loading @@ -97,12 +92,11 @@ public interface CommonKernelServices extends CLResources { threadCount /= WARP_GROUP_SIZE; } while (threadCount > 1); queue.putReadBuffer(intBuffer, false) .finish(); getLargestInteger.release(); queue.release(); return intBuffer.getBuffer().get(); return intBuffer.getBuffer().get(0); } Loading @@ -113,10 +107,8 @@ public interface CommonKernelServices extends CLResources { * @param size Size of the buffer * @param value to fill the buffer with */ static void initializeBuffer(CLContext clContext, CLBuffer<IntBuffer> buffer, int size, int value) { CLCommandQueue queue = clContext.getMaxFlopsDevice().createCommandQueue(); static void initializeBuffer(CLContext clContext, CLCommandQueue queue, CLBuffer<IntBuffer> buffer, int size, int value) { CLProgram kernelServicesProgram = OpenCLServices.useProgram(clContext, CLProgramDef.REDUCTION_SERVICES); int threads = getNearestGreaterMultiple(size, WARP_GROUP_SIZE); CLKernel initializeMemory = kernelServicesProgram.getKernel("initializeMemory"); initializeMemory.putArg(size) Loading @@ -126,7 +118,6 @@ public interface CommonKernelServices extends CLResources { queue.put1DRangeKernel(initializeMemory, 0, threads, WARP_GROUP_SIZE) .finish(); initializeMemory.release(); queue.release(); } /** Loading
GPU/src/main/java/cz/fidentis/analyst/opencl/services/octree/OctreeOpenCL.java +3 −2 Original line number Diff line number Diff line Loading @@ -17,6 +17,7 @@ import java.util.Collection; /** * Creates octree in OpenCL memory * Octree is static - Does not support modifications after creation * The implementation is thread-safe * * @author Marek Horský */ Loading @@ -32,9 +33,9 @@ public interface OctreeOpenCL extends CLResources { } /** * Builds the tree with provided facets. Reuses allocated memory if possible * Builds the octree with provided facets. Reuses allocated memory if possible * * @param facets * @param facets Facets to be loaded in octree */ void build(Collection<MeshFacet> facets); Loading
GPU/src/main/java/cz/fidentis/analyst/opencl/services/octree/impl/OctreeOpenCLImpl.java +19 −35 Original line number Diff line number Diff line Loading @@ -17,12 +17,17 @@ import cz.fidentis.analyst.opencl.services.octree.OctreeOpenCL; import javax.vecmath.Point3d; import java.nio.FloatBuffer; import java.nio.IntBuffer; import java.time.Instant; import java.util.Collection; /** * Builds Octree on GPU in Top-Down manner. Mesh Facet triangles are copied to GPU an subdivided into uniform octants * Builds Octree on GPU in top-down direction. Mesh Facet triangles are copied to GPU and subdivided into uniform octants * until maximum depth constraint is reached. * The process involves: * Memory allocation / initialization * Builds tree structure from vertices. * Finds non-empty leaves and largest leaf size * Allocates leaf memory * Populates structure with triangles * * @author Marek Horský */ Loading Loading @@ -50,14 +55,12 @@ public class OctreeOpenCLImpl implements OctreeOpenCL { * @param clContext Desired CL Context */ public OctreeOpenCLImpl(CLContext clContext) { long time = Instant.now().toEpochMilli(); this.clContext = clContext; this.queue = clContext.getMaxFlopsDevice().createCommandQueue(); this.octreeBuilderProgram = OpenCLServices.useProgram(clContext, CLProgramDef.OCTREE_CONSTRUCTION); this.facetBuffer = new FacetBuffer(clContext); this.maxLeafCount = OCTREE_SIZE; prepareBuffers(); System.out.println("constructor: " + ((Instant.now().toEpochMilli()) - time)); } @Override Loading Loading @@ -109,24 +112,19 @@ public class OctreeOpenCLImpl implements OctreeOpenCL { leaves.release(); } /** * Builds octree on GPU. Allocates memory. Creates tree structure from vertices. * Finds non-empty leaves and largest leaf. Allocates leaf memory and populates it with triangles * * @param facets source data */ @Override public void build(Collection<MeshFacet> facets) { facetBuffer.loadAsynchronously(facets); CommonKernelServices.initializeBuffer(clContext, tree.get(), maxLeafCount, -1); CommonKernelServices.initializeBuffer(clContext, queue, tree.get(), maxLeafCount, -1); vertexThreads = CommonKernelServices.getNearestGreaterMultiple( facetBuffer.getVertexBuffer().getCount(), CommonKernelServices.WARP_GROUP_SIZE); octreeBBox = CommonKernelServices.calculateBBox(clContext, facetBuffer.getVertexBuffer()); octreeBBox = CommonKernelServices.calculateBBox(clContext, queue, facetBuffer.getVertexBuffer()); assignVertices(facetBuffer.getVertexBuffer()); checkTriangles(facetBuffer.getMeshTriangleBuffer()); largestLeaf = CommonKernelServices.getLargestInteger(clContext, triangleCounters.get(), leafCount); largestLeaf = CommonKernelServices.getLargestInteger(clContext, queue, triangleCounters.get(), leafCount); assignTriangles(facetBuffer.getMeshTriangleBuffer()); } Loading @@ -139,7 +137,7 @@ public class OctreeOpenCLImpl implements OctreeOpenCL { } /** * Generates tree from vertices. Stopping constraint {@code MAX_DEPTH} due to memory constraints and atomic constraints. * Generates tree from vertices. Stopping condition {@code MAX_DEPTH} is used due to memory constraints and atomic constraints. * Writes tree as indexes to array. Each node is exactly 1 index pointing to his 1st child. * The 8 children are stored continuously (Maintains memory locality) */ Loading @@ -159,7 +157,7 @@ public class OctreeOpenCLImpl implements OctreeOpenCL { maxLeafCount = counter.getBuffer().get(0); triangleCounters.resize(maxLeafCount); CommonKernelServices.initializeBuffer(clContext, triangleCounters.get(), triangleCounters.getCount(), 0); CommonKernelServices.initializeBuffer(clContext, queue, triangleCounters.get(), triangleCounters.getCount(), 0); vertexBuild.release(); } Loading @@ -171,7 +169,7 @@ public class OctreeOpenCLImpl implements OctreeOpenCL { */ private void checkTriangles(WriteBufferGPU<MeshTriangle> meshTriangleBuffer) { counter.getBuffer().put(0, -3); // Leaf counter counter.getBuffer().put(0, -3); // Leaf counter ==> Distributes unique leaf indexes queue.putWriteBuffer(counter, false); triangleThreads = CommonKernelServices.getNearestGreaterMultiple( Loading @@ -182,8 +180,7 @@ public class OctreeOpenCLImpl implements OctreeOpenCL { .putArg(meshTriangleBuffer.getCount()) .putArgs(octreeBBox, meshTriangleBuffer.get(), tree.get(), triangleCounters.get(), counter); queue.finish() .put1DRangeKernel(checkTriangles, 0, triangleThreads, CommonKernelServices.WARP_GROUP_SIZE) queue.put1DRangeKernel(checkTriangles, 0, triangleThreads, CommonKernelServices.WARP_GROUP_SIZE) .putReadBuffer(counter, false) .finish(); Loading @@ -195,29 +192,16 @@ public class OctreeOpenCLImpl implements OctreeOpenCL { * Traverses tree structure generated from vertices 1 thread per triangle once again, this time stores the triangles correct leaves. */ private void assignTriangles(WriteBufferGPU<MeshTriangle> meshTriangleBuffer) { int bufferSize = leafCount * largestLeaf; System.out.println(leafCount + " * " + largestLeaf); if (bufferSize < 0) { System.err.println("Buffer size overflow"); return; } if (bufferSize == 0) { System.err.println("Buffer size is 0"); return; } leaves.resize(bufferSize); CommonKernelServices.initializeBuffer(clContext, leaves.get(), leaves.getCount(), -1); CommonKernelServices.initializeBuffer(clContext, triangleCounters.get(), triangleCounters.getCount(), 0); leaves.resize(leafCount * largestLeaf); CommonKernelServices.initializeBuffer(clContext, queue, leaves.get(), leaves.getCount(), -1); CommonKernelServices.initializeBuffer(clContext, queue, triangleCounters.get(), triangleCounters.getCount(), 0); CLKernel assignTriangles = octreeBuilderProgram.getKernel("assignTriangles") .putArg(meshTriangleBuffer.getCount()) .putArg(largestLeaf) .putArgs(octreeBBox, meshTriangleBuffer.get(), tree.get(), triangleCounters.get(), leaves.get()); queue.finish() .put1DRangeKernel(assignTriangles, 0, triangleThreads, CommonKernelServices.WARP_GROUP_SIZE) queue.put1DRangeKernel(assignTriangles, 0, triangleThreads, CommonKernelServices.WARP_GROUP_SIZE) .finish(); assignTriangles.release(); } Loading