'Using multi streams in cuda graph, the execution order is uncontrolled

I am using cuda graph stream capture API to implement a small demo with multi streams. Referenced by the CUDA Programming Guide here, I wrote the complete code. In my knowledge, kernelB should execute on stream1, but with nsys I found kernelB is executed on a complete new stream. It is under-control. The scheduling graph is showed below:

enter image description here

Here is my code:

#include <iostream>

__global__ void kernelA() {}
__global__ void kernelB() {}
__global__ void kernelC() {}

int main() {
  cudaStream_t stream1, stream2;
  cudaStreamCreate(&stream1);
  cudaStreamCreate(&stream2);

  cudaGraphExec_t graphExec = NULL;
  cudaEvent_t event1, event2;
  cudaEventCreate(&event1);
  cudaEventCreate(&event2);

  for (int i = 0; i < 10; i++) {
    cudaGraph_t graph;
    cudaGraphExecUpdateResult updateResult;
    cudaGraphNode_t errorNode;
    cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal);
    kernelA<<<512, 512, 0, stream1>>>();
    cudaEventRecord(event1, stream1);
    cudaStreamWaitEvent(stream2, event1, 0);
    kernelB<<<256, 512, 0, stream1>>>();
    kernelC<<<16, 512, 0, stream2>>>();
    cudaEventRecord(event2, stream2);
    cudaStreamWaitEvent(stream1, event2, 0);
    cudaStreamEndCapture(stream1, &graph);
    if (graphExec != NULL) {
      cudaGraphExecUpdate(graphExec, graph, &errorNode, &updateResult);
    }
    if (graphExec == NULL || updateResult != cudaGraphExecUpdateSuccess) {
      if (graphExec != NULL) {
        cudaGraphExecDestroy(graphExec);
      }
      cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);
    }
    cudaGraphDestroy(graph);
    cudaGraphLaunch(graphExec, stream1);
    cudaStreamSynchronize(stream1);
  }
}


Solution 1:[1]

"An operation may be scheduled at any time once the nodes on which it depends are complete. Scheduling is left up to the CUDA system." Here.

Solution 2:[2]

I also ask in Nvidia Forums, Robert answered this question which help me a lot. Someone who are interested in the scheduling of cuda graph can also reference to this answer here.

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 bob
Solution 2 poohRui