'Clickstream performance Scoring formula using SQL

imagine we have a clickstream and we want to have some everyday score that follows next rules:

  1. increase by 1 each impression
  2. increase by 10 each click
  3. decrease by 1 every second day without impressions (assume there cannot be click without impression)
  4. score cannot go bellow zero

First day score is 0 and every next day using the rules from above we're changing it. Every next day the score is based on prior day.

(each user should have own score)

The main question how to build such scoring method using SQL.

Thank you.

user_id date impressions clicks score why?
124 2021-12-13 1 0 1 1 imp, so +1 to our score (initially it's 0)
124 2021-12-14 0 0 1 1st day without imp, so leave as is
124 2021-12-15 0 0 0 2nd day, so decrease by 1
124 2021-12-16 0 0 0 3rd day
124 2021-12-17 0 0 0 4th day - we cannot go bellow 0 so cannot decrease by 1
124 2021-12-18 0 0 0 ...
124 2021-12-19 0 0 0
124 2021-12-20 0 0 0
124 2021-12-21 0 0 0
124 2021-12-22 0 0 0
124 2021-12-23 1 0 1 1 impr, so +1 to our score
124 2021-12-24 0 0 1 1st day without imp, so leave as is
124 2021-12-25 1 0 2 +1 more imp
124 2021-12-26 0 0 2 1st day without imp, so leave as is
124 2021-12-27 0 0 1 2nd day without imp, so decrease by 1
124 2021-12-28 1 1 12 10(clicks weight) + 1(imp weight) + 1(from prior day)

UPD1:

WITH subset as (
        select 124 as user_id, '2021-12-13' as date, 1 as impressions, 0 clicks
        union all select 124, '2021-12-14', 0, 0
        union all select 124, '2021-12-15', 0, 0
        union all select 124, '2021-12-16', 0, 0
        union all select 124, '2021-12-17', 0, 0
        union all select 124, '2021-12-18', 0, 0
        union all select 124, '2021-12-19', 0, 0
        union all select 124, '2021-12-20', 0, 0
        union all select 124, '2021-12-21', 0, 0
        union all select 124, '2021-12-22', 0, 0
        union all select 124, '2021-12-23', 1, 0
        union all select 124, '2021-12-24', 0, 0
        union all select 124, '2021-12-25', 1, 0
        union all select 124, '2021-12-26', 0, 0
        union all select 124, '2021-12-27', 0, 0
        union all select 124, '2021-12-28', 1, 1
        union all select 124, '2021-12-29', 0, 0
        union all select 124, '2021-12-30', 0, 0
        union all select 124, '2021-12-31', 0, 0
        union all select 124, '2022-01-01', 0, 0
        union all select 124, '2022-01-02', 0, 0
        union all select 124, '2022-01-03', 0, 0
        union all select 124, '2022-01-04', 0, 0
        union all select 124, '2022-01-05', 0, 0
        union all select 124, '2022-01-06', 0, 0
        union all select 124, '2022-01-07', 0, 0
        union all select 124, '2022-01-08', 0, 0
        union all select 124, '2022-01-09', 0, 0
        union all select 124, '2022-01-10', 0, 0
        union all select 124, '2022-01-11', 0, 0
        union all select 124, '2022-01-12', 0, 0
),

impression_window_tbl AS
 (SELECT *,
         sum(impressions) OVER (PARTITION BY user_id
             ORDER BY date) AS impressions_window
  FROM subset),

no_impressions_window AS
 (SELECT user_id,
         date,
         10                                                  AS click_cost,
         1                                                   AS impression_cost,
         2                                                   AS degradation_duration, -- every 2 days
         1                                                   AS degradation_cost,
         impressions_window,
         sum(impressions = 0) OVER (PARTITION BY user_id,
             impressions_window ORDER BY date)               AS row_num_with_no_impressions,
         clicks * click_cost + impressions * impression_cost AS daily_score,          -- ideally only 1 click and 1 impression per day per user
         clicks,
         impressions
  FROM impression_window_tbl),
degradation AS
 (SELECT *,                                                                                            -- this is only specific day degradation value.
         if((modulo(row_num_with_no_impressions, degradation_duration) = 0) = 1
                AND row_num_with_no_impressions > 0, -degradation_cost,
            0)                                                                   AS degradation_value, -- but we have to take into account the degradation_value
         -- but also there is possible cases where no impressions
         sum(daily_score) OVER (PARTITION BY user_id
             ORDER BY date ASC ROWS BETWEEN UNBOUNDED preceding AND CURRENT ROW) AS inc_cumsum         -- cumulative sum of daily_score

  FROM no_impressions_window)
SELECT *
FROM degradation;

Above is a code sample I'm using to play around.

It returns next:

user_id date click_cost impression_cost degradation_duration degradation_cost impressions_window row_num_with_no_impressions daily_score clicks impressions degradation_value inc_cumsum
124 2021-12-13 10 1 2 1 1 0 1 0 1 0 1
124 2021-12-14 10 1 2 1 1 1 0 0 0 0 1
124 2021-12-15 10 1 2 1 1 2 0 0 0 -1 1
124 2021-12-16 10 1 2 1 1 3 0 0 0 0 1
124 2021-12-17 10 1 2 1 1 4 0 0 0 -1 1
124 2021-12-18 10 1 2 1 1 5 0 0 0 0 1
124 2021-12-19 10 1 2 1 1 6 0 0 0 -1 1
124 2021-12-20 10 1 2 1 1 7 0 0 0 0 1
124 2021-12-21 10 1 2 1 1 8 0 0 0 -1 1
124 2021-12-22 10 1 2 1 1 9 0 0 0 0 1
124 2021-12-23 10 1 2 1 2 0 1 0 1 0 2
124 2021-12-24 10 1 2 1 2 1 0 0 0 0 2
124 2021-12-25 10 1 2 1 3 0 1 0 1 0 3
124 2021-12-26 10 1 2 1 3 1 0 0 0 0 3
124 2021-12-27 10 1 2 1 3 2 0 0 0 -1 3
124 2021-12-28 10 1 2 1 4 0 11 1 1 0 14
124 2021-12-29 10 1 2 1 4 1 0 0 0 0 14
124 2021-12-30 10 1 2 1 4 2 0 0 0 -1 14
124 2021-12-31 10 1 2 1 4 3 0 0 0 0 14
124 2022-01-01 10 1 2 1 4 4 0 0 0 -1 14
124 2022-01-02 10 1 2 1 4 5 0 0 0 0 14
124 2022-01-03 10 1 2 1 4 6 0 0 0 -1 14
124 2022-01-04 10 1 2 1 4 7 0 0 0 0 14
124 2022-01-05 10 1 2 1 4 8 0 0 0 -1 14
124 2022-01-06 10 1 2 1 4 9 0 0 0 0 14
124 2022-01-07 10 1 2 1 4 10 0 0 0 -1 14
124 2022-01-08 10 1 2 1 4 11 0 0 0 0 14
124 2022-01-09 10 1 2 1 4 12 0 0 0 -1 14
124 2022-01-10 10 1 2 1 4 13 0 0 0 0 14
124 2022-01-11 10 1 2 1 4 14 0 0 0 -1 14
124 2022-01-12 10 1 2 1 4 15 0 0 0 0 14

Degradation value as well as daily_score is for specific day only, it's not cumulative.

inc_cumsum is just an example, ideally we have to do same for degradation_value and sum them, but! we should take into account not going below zero level on daily basis.


UPD2: Solution from @Mikhail does not cover all cases. For data like this one:

WITH subset as (
        select 124 as user_id, toDate('2021-12-13') as date, 1 as impressions, 0 clicks
        union all select 124, toDate('2021-12-14'),  1, 0
        union all select 124, toDate('2021-12-15'),  1, 0
        union all select 124, toDate('2021-12-16'),  1, 0
        union all select 124, toDate('2021-12-17'),  0, 0
        union all select 124, toDate('2021-12-18'),  0, 0
        union all select 124, toDate('2021-12-19'),  0, 0
        union all select 124, toDate('2021-12-20'),  0, 0
        union all select 124, toDate('2021-12-21'),  0, 0
        union all select 124, toDate('2021-12-22'),  0, 0
        union all select 124, toDate('2021-12-23'),  0, 0
        union all select 124, toDate('2021-12-24'),  0, 0
        union all select 124, toDate('2021-12-25'),  0, 0
        union all select 124, toDate('2021-12-26'),  0, 0
        union all select 124, toDate('2021-12-27'),  0, 0
        union all select 124, toDate('2021-12-28'),  0, 0
        union all select 124, toDate('2021-12-29'),  0, 0
        union all select 124, toDate('2021-12-30'),  0, 0
        union all select 124, toDate('2021-12-31'),  0, 0
        union all select 124, toDate('2022-01-01'),  0, 0
        union all select 124, toDate('2022-01-02'),  0, 0
        union all select 124, toDate('2022-01-03'),  0, 0
        union all select 124, toDate('2022-01-04'),  0, 0
        union all select 124, toDate('2022-01-05'),  0, 0
        union all select 124, toDate('2022-01-06'),  8, 0
        union all select 124, toDate('2022-01-07'),  4, 0
        union all select 124, toDate('2022-01-08'),  0, 0
        union all select 124, toDate('2022-01-09'),  0, 0
        union all select 124, toDate('2022-01-10'),  0, 0
        union all select 124, toDate('2022-01-11'),  0, 0
        union all select 124, toDate('2022-01-12'),  0, 0
        union all select 124, toDate('2022-01-13'),  0, 0
        union all select 124, toDate('2022-01-14'),  0, 0
        union all select 124, toDate('2022-01-15'),  0, 0
        union all select 124, toDate('2022-01-16'),  0, 0
        union all select 124, toDate('2022-01-17'),  0, 0
        union all select 124, toDate('2022-01-18'),  0, 0
        union all select 124, toDate('2022-01-19'),  0, 0
        union all select 124, toDate('2022-01-20'),  0, 0
        union all select 124, toDate('2022-01-21'),  0, 0
        union all select 124, toDate('2022-01-22'),  0, 0
        union all select 124, toDate('2022-01-23'),  0, 0
        union all select 124, toDate('2022-01-24'),  0, 0
        union all select 124, toDate('2022-01-25'),  0, 0
        union all select 124, toDate('2022-01-26'),  8, 0
        union all select 124, toDate('2022-01-27'),  0, 0
        union all select 124, toDate('2022-01-28'),  0, 0
        union all select 124, toDate('2022-01-29'),  0, 0
),


Solution 1:[1]

Consider below approach

with temp as (
  select *, row_number() over(partition by user_id, grp order by date) pos,
    if(impressions > 0, impressions + 10 * clicks, 0) + 
    if(impressions = 0, mod(row_number() over(partition by user_id, grp order by date), 2) - 1, 0) as total
  from (
    select *, countif(new_grp) over(partition by user_id order by date) grp
    from (
      select *, 
        ((0 = lag(impressions) over(partition by user_id order by date)) and impressions > 0) or 
        ((0 < lag(impressions) over(partition by user_id order by date)) and impressions = 0)  as new_grp
      from subset
    )
  )
), final as (
  select distinct a.*, ifnull(adjustment, 0) adjustment 
  from temp a
  left join(
    select user_id, grp + 1 as grp, -1 - adjustment as adjustment
    from (
          select user_id, grp, sum(total) adjustment
          from temp
          group by user_id, grp
          having adjustment < 0 
      )
  ) b
  on a.user_id = b.user_id 
  and a.grp = b.grp 
  and a.pos = 1
)
select user_id, date, impressions, clicks, if(score < 0, 0, score) as score
from (
  select *, sum(total + if(pos = 1, adjustment, 0)) over(partition by user_id order by date) score
  from final
)            

if applied to sample data in y our question - output is

enter image description here

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 Mikhail Berlyant