Friday June 27, 2025

Home | Contact | Support | WebGPU Graphics and Compute ... | WebGPU 'Compute'.. Compute, Algorithms, and Code.....

WebGPU 'Compute'..

Compute, Algorithms, and Code.....

Radix Sort Algorithm (WebGPU/WGSL)

Sorting a large array on the GPU using the compute shader. For this example, we use the 'radix' sort algorithm.

Radix sort algorithm output.

Functions Used: requestAdapter(), getPreferredCanvasFormat(), createCommandEncoder(), beginRenderPass(), setPipeline(), draw(), end(), submit(), getCurrentTexture(), createView(), createShaderModule()

Radix Sort is a non-comparative sorting algorithm that sorts integers by processing individual digits.

Unlike comparison-based algorithms like Quick Sort or Merge Sort, Radix Sort groups numbers by their digit values, starting from the least significant digit (LSD) to the most significant digit (MSD).

It uses a stable sorting subroutine, typically Counting Sort, to sort numbers at each digit level, ensuring that the relative order of numbers with the same digit value is preserved across iterations. This stability is crucial for maintaining the overall order when processing subsequent digits.

The algorithm begins by initializing a series of buckets or buffers to count the occurrences of each digit (0-9 for decimal numbers) at a given place value (units, tens, hundreds, etc.).

For each digit place, the algorithm iterates through the entire list of numbers, extracting the relevant digit, and updating the count in the corresponding bucket. After counting, a prefix sum or cumulative count is computed, transforming the count array into a position array that indicates where each digit should be placed in the output array.

In the final phase, the numbers are rearranged based on the prefix sum array to ensure they are sorted according to the current digit. This process is repeated for each digit place, progressively refining the order of the list.

At the end of the process, the numbers are sorted in ascending order.

Radix Sort's efficiency comes from its linear time complexity for fixed-width integers, making it particularly effective for sorting large datasets with uniformly distributed keys; its performance can be affected by the number of digits and the base used for digit extraction, as well as the additional memory required for the counting and prefix sum arrays.

Complete Code

The complete code for the WebGPU (JavaScript) and the WGSL compute shader.

<?php
const wgslcompute = `
struct Data {
    numbers: array<u32>
};

@group(0) @binding(0) var<storage, read_write> data: Data;
@group(0) @binding(1) var<storage, read_write> tempData: Data;
@group(0) @binding(2) var<storage, read_write> countBuffer: array<atomic<u32>>;
@group(0) @binding(3) var<storage, read_write> prefixSumBuffer: array<atomic<u32>>;


fn getDigit(value: u32, digit: u32, base: u32) -> u32 {
    var divisor: u32 = 1;
    for (var i: u32 = 0; i < digit; i++) {
        divisor *= base;
    }
    return (value / divisor) % base;
}

//fn getDigit(value: u32, digit: u32, base: u32) -> u32 {
//    return ( value / u32(pow(f32(base), f32(digit) ) ) ) % base;
//}


@compute @workgroup_size(8)
fn main(@builtin(global_invocation_id) globalId      : vec3<u32>,
		@builtin(local_invocation_id)  localId       : vec3<u32>,
		@builtin(workgroup_id)         workgroupId   : vec3<u32>,
        @builtin(num_workgroups)	   numWorkgroups : vec3<u32>
        ) 
{
    let idx = globalId.x;
    let numElements = arrayLength(&data.numbers); // Custom implementation or predefined size
    let base: u32 = 10;
    let numDigits: u32 = 10;  // Assumes sorting integers

    for (var digit: u32 = 0; digit < numDigits; digit++) {
        // Step 1: Counting
        if (idx < base) {
            atomicStore(&countBuffer[idx], 0u);
        }
        workgroupBarrier();
        storageBarrier();

        for (var i: u32 = idx; i < numElements; i += numWorkgroups.x * 8) {
            let digitValue = getDigit(data.numbers[i], digit, base);
            atomicAdd(&countBuffer[digitValue], 1u);
        }
        workgroupBarrier();
        storageBarrier();

        // Step 2: Prefix Sum (exclusive scan)
        if (idx == 0) {
            var sum: u32 = 0;
            for (var i: u32 = 0; i < base; i++) {
                let count = atomicLoad(&countBuffer[i]);
                atomicStore(&prefixSumBuffer[i], sum);
                sum += count;
            }
        }
        workgroupBarrier();
        storageBarrier();

        // Step 3: Rearranging
        for (var i: u32 = idx; i < numElements; i += numWorkgroups.x * 8) {
            let digitValue = getDigit(data.numbers[i], digit, base);
            let pos = atomicAdd(&prefixSumBuffer[digitValue], 1u);
            tempData.numbers[pos] = data.numbers[i];
        }
        workgroupBarrier();
        storageBarrier();

        // Step 4: Copy back
        for (var i: u32 = idx; i < numElements; i += numWorkgroups.x * 1) {
            data.numbers[i] = tempData.numbers[i];
        }
        workgroupBarrier();
        storageBarrier();
    }
}


`;

let div = document.createElement('div');
document.body.appendChild( div );
div.style['font-size'] = '20pt';
function log( s )
{
  console.log( s );
  let args = [...arguments].join(' ');
  div.innerHTML += args + '<br>';
}

log('WebGPU Compute Example');

if (!navigator.gpu) { log("WebGPU is not supported (or is it disabled? flags/settings)"); return; }

const adapter = await navigator.gpu.requestAdapter();
const device  = await adapter.requestDevice();


const arraySize = 64;
const data0 = new Uint32Array(arraySize);
for (let i = 0; i < arraySize; i++) {
  //data0[i] = arraySize - i; // Math.round( Math.random() * 100.0 );
  data0[i] = Math.round( Math.random() * 100.0 );
}

const gbuffer0 = device.createBuffer({ size:  data0.byteLength, usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC });
device.queue.writeBuffer(gbuffer0, 0, data0);


const tempBuffer = device.createBuffer({ size: data0.byteLength, usage: GPUBufferUsage.STORAGE });

const blank2 = new Uint32Array(arraySize);
for (let i = 0; i < arraySize; i++) {
  blank2[i] = 0;
}
device.queue.writeBuffer(tempBuffer, 0, blank2);

// 10 is for 10 digits
const countBuffer = device.createBuffer({ size: 10 * Uint32Array.BYTES_PER_ELEMENT, usage: GPUBufferUsage.STORAGE });

const prefixSumBuffer = device.createBuffer({ size: 10 * Uint32Array.BYTES_PER_ELEMENT,  usage: GPUBufferUsage.STORAGE });

const blank = new Uint32Array( 10 );
for (let i = 0; i < 10; i++)
{
  blank[i] = 0;
}
device.queue.writeBuffer(countBuffer, 0, blank);
device.queue.writeBuffer(prefixSumBuffer, 0, blank);

//const resultBuffer = device.createBuffer({ size: data0.byteLength, usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST });

const readbackBuffer = device.createBuffer({ size: data0.byteLength, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ });


const bindGroupLayout = device.createBindGroupLayout({
  entries: [  { binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' }},
              { binding: 1, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' }},
              { binding: 2, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' }},
              { binding: 3, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' }}
               
           ]
});

const bindGroup = device.createBindGroup({
  layout: bindGroupLayout,
  entries: [{ binding: 0, resource: { buffer: gbuffer0        }},
            { binding: 1, resource: { buffer: tempBuffer      }},
            { binding: 2, resource: { buffer: countBuffer     }},
            { binding: 3, resource: { buffer: prefixSumBuffer }}
           ]
});

const shaderModule = device.createShaderModule( {  code: wgslcompute } );
  
const computePipeline = device.createComputePipeline({
  layout: device.createPipelineLayout({bindGroupLayouts: [bindGroupLayout]}),
  compute: { module: shaderModule,
             entryPoint: 'main'  }
});


const commandEncoder = device.createCommandEncoder();
const passEncoder = commandEncoder.beginComputePass();
passEncoder.setPipeline(computePipeline);
passEncoder.setBindGroup(0, bindGroup);
passEncoder.dispatchWorkgroups( 1 ); // arraySize / 32  );
await passEncoder.end();

commandEncoder.copyBufferToBuffer(gbuffer0, 0, readbackBuffer, 0, data0.byteLength);

const commands = commandEncoder.finish();
await device.queue.submit([commands]);

await readbackBuffer.mapAsync(GPUMapMode.READ);
const sortedArray = new Uint32Array(readbackBuffer.getMappedRange());
console.log(sortedArray);

log('Sorted array:')
const arr = Array.from( sortedArray );
for (let i=0; i<arr.length; i++)
{
  	// display over multiple lines
	log( arr.splice( 0, 10 ) );  
}

log('Input array:')
const inarr = Array.from( data0 );
for (let i=0; i<inarr.length; i++)
{
  	// display over multiple lines
	log( inarr.splice( 0, 10 ) );  
}

const wgslcompute = `
struct Data {
    numbers: array<u32>
};

@group(0) @binding(0) var<storage, read_write> data: Data;
@group(0) @binding(1) var<storage, read_write> tempData: Data;
@group(0) @binding(2) var<storage, read_write> countBuffer: array<atomic<u32>>;
@group(0) @binding(3) var<storage, read_write> prefixSumBuffer: array<atomic<u32>>;

fn getDigit(value: u32, digit: u32, base: u32) -> u32 {
    var divisor: u32 = 1;
    for (var i: u32 = 0; i < digit; i++) {
        divisor *= base;
    }
    return (value / divisor) % base;
}

//fn getDigit(value: u32, digit: u32, base: u32) -> u32 {
//    return ( value / u32(pow(f32(base), f32(digit) ) ) ) % base;
//}

@compute @workgroup_size(8)
fn main(@builtin(global_invocation_id) globalId      : vec3<u32>,
        @builtin(local_invocation_id)  localId       : vec3<u32>,
        @builtin(workgroup_id)         workgroupId   : vec3<u32>,
        @builtin(num_workgroups)       numWorkgroups : vec3<u32>
        )
{
    let idx = globalId.x;
    let numElements = arrayLength(&data.numbers); // Custom implementation or predefined size
    let base: u32 = 10;
    let numDigits: u32 = 10;  // Assumes sorting integers

    for (var digit: u32 = 0; digit < numDigits; digit++) {
        // Step 1: Counting
        if (idx < base) {
            atomicStore(&countBuffer[idx], 0u);
        }
        workgroupBarrier();
        storageBarrier();

        for (var i: u32 = idx; i < numElements; i += numWorkgroups.x * 8) {
            let digitValue = getDigit(data.numbers[i], digit, base);
            atomicAdd(&countBuffer[digitValue], 1u);
        }
        workgroupBarrier();
        storageBarrier();

        // Step 2: Prefix Sum (exclusive scan)
        if (idx == 0) {
            var sum: u32 = 0;
            for (var i: u32 = 0; i < base; i++) {
                let count = atomicLoad(&countBuffer[i]);
                atomicStore(&prefixSumBuffer[i], sum);
                sum += count;
            }
        }
        workgroupBarrier();
        storageBarrier();

        // Step 3: Rearranging
        for (var i: u32 = idx; i < numElements; i += numWorkgroups.x * 8) {
            let digitValue = getDigit(data.numbers[i], digit, base);
            let pos = atomicAdd(&prefixSumBuffer[digitValue], 1u);
            tempData.numbers[pos] = data.numbers[i];
        }
        workgroupBarrier();
        storageBarrier();

        // Step 4: Copy back
        for (var i: u32 = idx; i < numElements; i += numWorkgroups.x * 1) {
            data.numbers[i] = tempData.numbers[i];
        }
        workgroupBarrier();
        storageBarrier();
    }
}

`;

let div = document.createElement('div');
document.body.appendChild( div );
div.style['font-size'] = '20pt';
function log( s )
{
  console.log( s );
  let args = [...arguments].join(' ');
  div.innerHTML += args + '<br>';
}

log('WebGPU Compute Example');

if (!navigator.gpu) { log("WebGPU is not supported (or is it disabled? flags/settings)"); return; }

const adapter = await navigator.gpu.requestAdapter();
const device  = await adapter.requestDevice();

const arraySize = 64;
const data0 = new Uint32Array(arraySize);
for (let i = 0; i < arraySize; i++) {
  //data0[i] = arraySize - i; // Math.round( Math.random() * 100.0 );
  data0[i] = Math.round( Math.random() * 100.0 );
}

const gbuffer0 = device.createBuffer({ size:  data0.byteLength, usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC });
device.queue.writeBuffer(gbuffer0, 0, data0);

const tempBuffer = device.createBuffer({ size: data0.byteLength, usage: GPUBufferUsage.STORAGE });

const blank2 = new Uint32Array(arraySize);
for (let i = 0; i < arraySize; i++) {
  blank2[i] = 0;
}
device.queue.writeBuffer(tempBuffer, 0, blank2);

// 10 is for 10 digits
const countBuffer = device.createBuffer({ size: 10 * Uint32Array.BYTES_PER_ELEMENT, usage: GPUBufferUsage.STORAGE });

const prefixSumBuffer = device.createBuffer({ size: 10 * Uint32Array.BYTES_PER_ELEMENT,  usage: GPUBufferUsage.STORAGE });

const blank = new Uint32Array( 10 );
for (let i = 0; i < 10; i++)
{
  blank[i] = 0;
}
device.queue.writeBuffer(countBuffer, 0, blank);
device.queue.writeBuffer(prefixSumBuffer, 0, blank);

//const resultBuffer = device.createBuffer({ size: data0.byteLength, usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST });

const readbackBuffer = device.createBuffer({ size: data0.byteLength, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ });

const bindGroupLayout = device.createBindGroupLayout({
  entries: [  { binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' }},
              { binding: 1, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' }},
              { binding: 2, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' }},
              { binding: 3, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' }}

           ]
});

const bindGroup = device.createBindGroup({
  layout: bindGroupLayout,
  entries: [{ binding: 0, resource: { buffer: gbuffer0        }},
            { binding: 1, resource: { buffer: tempBuffer      }},
            { binding: 2, resource: { buffer: countBuffer     }},
            { binding: 3, resource: { buffer: prefixSumBuffer }}
           ]
});

const shaderModule = device.createShaderModule( {  code: wgslcompute } );

const computePipeline = device.createComputePipeline({
  layout: device.createPipelineLayout({bindGroupLayouts: [bindGroupLayout]}),
  compute: { module: shaderModule,
             entryPoint: 'main'  }
});

const commandEncoder = device.createCommandEncoder();
const passEncoder = commandEncoder.beginComputePass();
passEncoder.setPipeline(computePipeline);
passEncoder.setBindGroup(0, bindGroup);
passEncoder.dispatchWorkgroups( 1 ); // arraySize / 32  );
await passEncoder.end();

commandEncoder.copyBufferToBuffer(gbuffer0, 0, readbackBuffer, 0, data0.byteLength);

const commands = commandEncoder.finish();
await device.queue.submit([commands]);

await readbackBuffer.mapAsync(GPUMapMode.READ);
const sortedArray = new Uint32Array(readbackBuffer.getMappedRange());
console.log(sortedArray);

log('Sorted array:')
const arr = Array.from( sortedArray );
for (let i=0; i<arr.length; i++)
{
      // display over multiple lines
    log( arr.splice( 0, 10 ) );
}

log('Input array:')
const inarr = Array.from( data0 );
for (let i=0; i<inarr.length; i++)
{
      // display over multiple lines
    log( inarr.splice( 0, 10 ) );
}

Resources and Links

• WebGPU Lab Radix Sort Demo [LINK]

• WebGPU Lab Bitronc Demo [LINK]

Advert (Support Website)

Visitor:

Copyright (c) 2002-2025 xbdev.net - All rights reserved.
Designated articles, tutorials and software are the property of their respective owners.