www.xbdev.net
xbdev - software development
Friday June 27, 2025
Home | Contact | Support | WebGPU Graphics and Compute ... | WebGPU 'Compute'.. Compute, Algorithms, and Code.....
     
 

WebGPU 'Compute'..

Compute, Algorithms, and Code.....

 

Radix Sort Algorithm (WebGPU/WGSL)


Sorting a large array on the GPU using the compute shader. For this example, we use the 'radix' sort algorithm.


Radix sort algorithm output.
Radix sort algorithm output.


Functions Used: requestAdapter(), getPreferredCanvasFormat(), createCommandEncoder(), beginRenderPass(), setPipeline(), draw(), end(), submit(), getCurrentTexture(), createView(), createShaderModule()

Radix Sort is a non-comparative sorting algorithm that sorts integers by processing individual digits.

Unlike comparison-based algorithms like Quick Sort or Merge Sort, Radix Sort groups numbers by their digit values, starting from the least significant digit (LSD) to the most significant digit (MSD).

It uses a stable sorting subroutine, typically Counting Sort, to sort numbers at each digit level, ensuring that the relative order of numbers with the same digit value is preserved across iterations. This stability is crucial for maintaining the overall order when processing subsequent digits.

The algorithm begins by initializing a series of buckets or buffers to count the occurrences of each digit (0-9 for decimal numbers) at a given place value (units, tens, hundreds, etc.).

For each digit place, the algorithm iterates through the entire list of numbers, extracting the relevant digit, and updating the count in the corresponding bucket. After counting, a prefix sum or cumulative count is computed, transforming the count array into a position array that indicates where each digit should be placed in the output array.

In the final phase, the numbers are rearranged based on the prefix sum array to ensure they are sorted according to the current digit. This process is repeated for each digit place, progressively refining the order of the list.

At the end of the process, the numbers are sorted in ascending order.

Radix Sort's efficiency comes from its linear time complexity for fixed-width integers, making it particularly effective for sorting large datasets with uniformly distributed keys; its performance can be affected by the number of digits and the base used for digit extraction, as well as the additional memory required for the counting and prefix sum arrays.



Complete Code


The complete code for the WebGPU (JavaScript) and the WGSL compute shader.

const wgslcompute = `
struct Data {
    numbers: array<u32>
};

@group(0) @binding(0) var<storage, read_write> data: Data;
@group(0) @binding(1) var<storage, read_write> tempData: Data;
@group(0) @binding(2) var<storage, read_write> countBuffer: array<atomic<u32>>;
@group(0) @binding(3) var<storage, read_write> prefixSumBuffer: array<atomic<u32>>;


fn getDigit(value: u32, digit: u32, base: u32) -> u32 {
    var divisor: u32 = 1;
    for (var i: u32 = 0; i < digit; i++) {
        divisor *= base;
    }
    return (value / divisor) % base;
}

//fn getDigit(value: u32, digit: u32, base: u32) -> u32 {
//    return ( value / u32(pow(f32(base), f32(digit) ) ) ) % base;
//}


@compute @workgroup_size(8)
fn main(@builtin(global_invocation_id) globalId      : vec3<u32>,
        @builtin(local_invocation_id)  localId       : vec3<u32>,
        @builtin(workgroup_id)         workgroupId   : vec3<u32>,
        @builtin(num_workgroups)       numWorkgroups : vec3<u32>
        ) 
{
    let idx = globalId.x;
    let numElements = arrayLength(&data.numbers); // Custom implementation or predefined size
    let base: u32 = 10;
    let numDigits: u32 = 10;  // Assumes sorting integers

    for (var digit: u32 = 0; digit < numDigits; digit++) {
        // Step 1: Counting
        if (idx < base) {
            atomicStore(&countBuffer[idx], 0u);
        }
        workgroupBarrier();
        storageBarrier();

        for (var i: u32 = idx; i < numElements; i += numWorkgroups.x * 8) {
            let digitValue = getDigit(data.numbers[i], digit, base);
            atomicAdd(&countBuffer[digitValue], 1u);
        }
        workgroupBarrier();
        storageBarrier();

        // Step 2: Prefix Sum (exclusive scan)
        if (idx == 0) {
            var sum: u32 = 0;
            for (var i: u32 = 0; i < base; i++) {
                let count = atomicLoad(&countBuffer[i]);
                atomicStore(&prefixSumBuffer[i], sum);
                sum += count;
            }
        }
        workgroupBarrier();
        storageBarrier();

        // Step 3: Rearranging
        for (var i: u32 = idx; i < numElements; i += numWorkgroups.x * 8) {
            let digitValue = getDigit(data.numbers[i], digit, base);
            let pos = atomicAdd(&prefixSumBuffer[digitValue], 1u);
            tempData.numbers[pos] = data.numbers[i];
        }
        workgroupBarrier();
        storageBarrier();

        // Step 4: Copy back
        for (var i: u32 = idx; i < numElements; i += numWorkgroups.x * 1) {
            data.numbers[i] = tempData.numbers[i];
        }
        workgroupBarrier();
        storageBarrier();
    }
}


`;

let div document.createElement('div');
document.body.appendChilddiv );
div.style['font-size'] = '20pt';
function 
log)
{
  
console.log);
  
let args = [...arguments].join(' ');
  
div.innerHTML += args '<br>';
}

log('WebGPU Compute Example');

if (!
navigator.gpu) { log("WebGPU is not supported (or is it disabled? flags/settings)"); return; }

const 
adapter await navigator.gpu.requestAdapter();
const 
device  await adapter.requestDevice();


const 
arraySize 64;
const 
data0 = new Uint32Array(arraySize);
for (
let i 0arraySizei++) {
  
//data0[i] = arraySize - i; // Math.round( Math.random() * 100.0 );
  
data0[i] = Math.roundMath.random() * 100.0 );
}

const 
gbuffer0 device.createBuffer({ size:  data0.byteLengthusageGPUBufferUsage.STORAGE GPUBufferUsage.COPY_DST GPUBufferUsage.COPY_SRC });
device.queue.writeBuffer(gbuffer00data0);


const 
tempBuffer device.createBuffer({ sizedata0.byteLengthusageGPUBufferUsage.STORAGE });

const 
blank2 = new Uint32Array(arraySize);
for (
let i 0arraySizei++) {
  
blank2[i] = 0;
}
device.queue.writeBuffer(tempBuffer0blank2);

// 10 is for 10 digits
const countBuffer device.createBuffer({ size10 Uint32Array.BYTES_PER_ELEMENTusageGPUBufferUsage.STORAGE });

const 
prefixSumBuffer device.createBuffer({ size10 Uint32Array.BYTES_PER_ELEMENT,  usageGPUBufferUsage.STORAGE });

const 
blank = new Uint32Array10 );
for (
let i 010i++)
{
  
blank[i] = 0;
}
device.queue.writeBuffer(countBuffer0blank);
device.queue.writeBuffer(prefixSumBuffer0blank);

//const resultBuffer = device.createBuffer({ size: data0.byteLength, usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST });

const readbackBuffer device.createBuffer({ sizedata0.byteLengthusageGPUBufferUsage.COPY_DST GPUBufferUsage.MAP_READ });


const 
bindGroupLayout device.createBindGroupLayout({
  
entries: [  { binding0visibilityGPUShaderStage.COMPUTEbuffer: { type'storage' }},
              { 
binding1visibilityGPUShaderStage.COMPUTEbuffer: { type'storage' }},
              { 
binding2visibilityGPUShaderStage.COMPUTEbuffer: { type'storage' }},
              { 
binding3visibilityGPUShaderStage.COMPUTEbuffer: { type'storage' }}
               
           ]
});

const 
bindGroup device.createBindGroup({
  
layoutbindGroupLayout,
  
entries: [{ binding0resource: { buffergbuffer0        }},
            { 
binding1resource: { buffertempBuffer      }},
            { 
binding2resource: { buffercountBuffer     }},
            { 
binding3resource: { bufferprefixSumBuffer }}
           ]
});

const 
shaderModule device.createShaderModule( {  codewgslcompute } );
  
const 
computePipeline device.createComputePipeline({
  
layoutdevice.createPipelineLayout({bindGroupLayouts: [bindGroupLayout]}),
  
compute: { moduleshaderModule,
             
entryPoint'main'  }
});


const 
commandEncoder device.createCommandEncoder();
const 
passEncoder commandEncoder.beginComputePass();
passEncoder.setPipeline(computePipeline);
passEncoder.setBindGroup(0bindGroup);
passEncoder.dispatchWorkgroups); // arraySize / 32  );
await passEncoder.end();

commandEncoder.copyBufferToBuffer(gbuffer00readbackBuffer0data0.byteLength);

const 
commands commandEncoder.finish();
await device.queue.submit([commands]);

await readbackBuffer.mapAsync(GPUMapMode.READ);
const 
sortedArray = new Uint32Array(readbackBuffer.getMappedRange());
console.log(sortedArray);

log('Sorted array:')
const 
arr = Array.fromsortedArray );
for (
let i=0i<arr.lengthi++)
{
      
// display over multiple lines
    
logarr.splice010 ) );  
}

log('Input array:')
const 
inarr = Array.fromdata0 );
for (
let i=0i<inarr.lengthi++)
{
      
// display over multiple lines
    
loginarr.splice010 ) );  
}





Resources and Links


• WebGPU Lab Radix Sort Demo [LINK]

• WebGPU Lab Bitronc Demo [LINK]















WebGPU by Example: Fractals, Image Effects, Ray-Tracing, Procedural Geometry, 2D/3D, Particles, Simulations WebGPU Compute graphics and animations using the webgpu api 12 week course kenwright learn webgpu api kenwright programming compute and graphics applications with html5 and webgpu api kenwright real-time 3d graphics with webgpu kenwright webgpu api develompent a quick start guide kenwright webgpu by example 2022 kenwright webgpu gems kenwright webgpu interactive compute and graphics visualization cookbook kenwright wgsl webgpu shading language cookbook kenwright wgsl webgpugems shading language cookbook kenwright



 
Advert (Support Website)

 
 Visitor:
Copyright (c) 2002-2025 xbdev.net - All rights reserved.
Designated articles, tutorials and software are the property of their respective owners.