Performance issue running a MongoDB aggregation

I need to run a query that joins documents from two collections. I wrote an aggregation query, but it takes too long when run against the production database, which contains many documents. Is there a more efficient way to write this query?

Query in Mongo playground: https://mongoplayground.net/p/dLb3hsJHNYt

There are two collections: users and activities. I need a query that returns some users (from the users collection) together with each user's last activity (from the activities collection).

Database:

db={
    "users": [
      {
        "_id": 1,
        "email": "user1@example.com",
        "username": "user1",
        "country": "BR",
        "creation_date": 1646873628
      },
      {
        "_id": 2,
        "email": "user2@example.com",
        "username": "user2",
        "country": "US",
        "creation_date": 1646006402
      }
    ],
    "activities": [
      {
        "_id": 1,
        "email": "user1@example.com",
        "activity": "like",
        "timestamp": 1647564787
      },
      {
        "_id": 2,
        "email": "user1@example.com",
        "activity": "comment",
        "timestamp": 1647564834
      },
      {
        "_id": 3,
        "email": "user2@example.com",
        "activity": "like",
        "timestamp": 1647564831
      }
    ]
}

Inefficient Query:

db.users.aggregate([
  {
    // Stage 1: select users that are not from AR or CA and whose
    // creation_date lies inside the requested window (inclusive).
    "$match": {
      "$expr": {
        "$and": [
          {
            "$not": {
              "$in": [ "$country", [ "AR", "CA" ] ]
            }
          },
          { "$gte": [ "$creation_date", 1646006400 ] },
          { "$lte": [ "$creation_date", 1648684800 ] }
        ]
      }
    }
  },
  {
    // Stage 2: for every selected user, look up that user's activities
    // (joined on email) inside the timestamp window and keep only the
    // most recent one. The outcome is stored as the array "last_activity".
    "$lookup": {
      "from": "activities",
      "let": { "user_email": "$email" },
      "pipeline": [
        {
          "$match": {
            "$expr": {
              "$and": [
                { "$eq": [ "$email", "$$user_email" ] },
                { "$gte": [ "$timestamp", 1647564787 ] },
                { "$lte": [ "$timestamp", 1647564834 ] }
              ]
            }
          }
        },
        // Newest activity first, then keep a single document.
        { "$sort": { "timestamp": -1 } },
        { "$limit": 1 }
      ],
      "as": "last_activity"
    }
  },
  {
    // Stage 3: discard users for whom the lookup returned an empty array.
    "$match": {
      "$expr": {
        "$gt": [ { "$size": "$last_activity" }, 0 ]
      }
    }
  }
])

Result:

[
    {
        "_id": 1,
        "country": "BR",
        "creation_date": 1.646873628e+09,
        "email": "user1@example.com",
        "last_activity": [
        {
            "_id": 2,
            "activity": "comment",
            "email": "user1@example.com",
            "timestamp": 1.647564834e+09
        }
        ],
        "username": "user1"
    },
    {
        "_id": 2,
        "country": "US",
        "creation_date": 1.646006402e+09,
        "email": "user2@example.com",
        "last_activity": [
        {
            "_id": 3,
            "activity": "like",
            "email": "user2@example.com",
            "timestamp": 1.647564831e+09
        }
        ],
        "username": "user2"
    }
]

I'm more familiar with relational databases, so I'm struggling a little to run this query efficiently.

Thanks!



Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source