'BFS Parallel OpenMP low speed up

The following parallelized version of the BFS algorithm code on a graph gave me a speed up of 1.5 compared to the sequential version considering a fully connected graph with 5000 nodes, on a dual core 4 threads processor. I was wondering if it was possible to modify this algorithm to improve parallelization. Furthermore, with the code shown below I cannot consider a graph of over 10000 nodes which is returned to me as a segmentation fault and I would like to know if it is possible to solve this problem to test the code on a larger graph. Below is the code for the bfs and the main.

MAIN:

#include "bfs.h"
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/**
 * Conditional compilation to calculate the time with omp_get_wtime() 
 * if you are using openMP or with the macros defined in the bfs.h if you are not using open MP. 
 */

#ifdef _OPENMP
    #include <omp.h>
#else
    #define omp_get_wtime() 0
#endif


int main(int argc, char const *argv[])
{

    double bfs_time = 0.0, initialization_time = 0.0;
    int n, threads;
    double start_time_init, end_time_init, start_time_bfs, end_time_bfs;

    if (argc < 3)
    {
        printf("ERROR! Usage: ./main dim threads\n");
        exit(1);
    }

    n = atoi(argv[1]);
    threads = atoi(argv[2]);

    
    STARTTIME(1);
    start_time_init = omp_get_wtime();
    struct node *nodes = (struct node*)malloc(n*sizeof(struct node));
        init_structures(n, nodes);
    ENDTIME(1, initialization_time);
        end_time_init = omp_get_wtime();



    STARTTIME(2);
        start_time_bfs = omp_get_wtime();
    bool *visited = (bool*)malloc(n);
    parallel_bfs(n, nodes, 0, visited, threads);
    ENDTIME(2, bfs_time);
    end_time_bfs = omp_get_wtime();
    
    

    
    if (threads != 0){
        initialization_time = end_time_init - start_time_init;
        bfs_time = end_time_bfs - start_time_bfs;
    }

    printf("%d;%d;%f;%f\n", n, threads, initialization_time, bfs_time);
    free(visited);
    free(nodes);
    return 0;
    
}

BFS:

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdint.h>
#include <time.h>
#include "bfs.h"


/**
 * Conditional compilation to run the code with threads if you are using openMP 
 * or with 0 thread if you are not using open MP. 
 */

#ifdef _OPENMP
    #include <omp.h>
#else
    #define get_thread_num() 0
#endif


void init_structures(int n, struct node *nodes) {

    
    for (int i=0; i<n; i++)
    {
        nodes[i].num_neighbours=n-1;
        nodes[i].next = (int*)malloc(nodes[i].num_neighbours*sizeof(int));
        for (int j=0; j<nodes[i].num_neighbours; j++)
        {   
            if (j!=i) { 
            nodes[i].next[j]=j; }
        }
    }
    
    
}



int visit_check(int n, bool *visited) // funzione per verificare quale nodo è stato visitato oppure no
{
    for (int i=0; i<n; i++)
    {
        if (visited[i]==false)
        {
            return i;
        }
    }
    return -1;
}

void parallel_bfs(int n, struct node *nodes, int start, bool *visited, int threads)
{
    
    int *level = (int*)malloc(n*sizeof(int)); 
    visited[start] = true; 
    level[start] = 0;
    int *q[2];  
    q[0] = (int*)malloc(n*sizeof(int)); 
    q[1] = (int*)malloc(n*sizeof(int)); 
    q[0][0] = start;  
    int len[2]; 
    len[0] = 1; 
    int next_queue, current_queue = 0;
    int current_node; 
    while(1)
    {
        current_node = q[current_queue][0];  

        next_queue = (current_queue + 1)%2; 
        len[next_queue] = 0; 
        #pragma omp parallel for private(current_node) num_threads(threads) 
        for (int i=0; i<len[current_queue]; i++) 
        {
            current_node = q[current_queue][i]; 
            for (int j=0; j<nodes[current_node].num_neighbours; j++)
            {
                if (!visited[nodes[current_node].next[j]]) 
                {
                    visited[nodes[current_node].next[j]] = true;  
                    level[nodes[current_node].next[j]] = level[current_node]+1; 
                    #pragma omp critical 
                    q[next_queue][len[next_queue]++] = nodes[current_node].next[j];
                }
            }
        }
        if (len[next_queue]==0) 
            break;
        else
            current_queue = next_queue; 
    }
    free(level);
    free(q[0]);
    free(q[1]);
}

Thanks a lot for your help.



Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source