PySpark AssertionError: on should be Column or list of Column

Hi, I have the dataframes below, and when I join them I get `AssertionError: on should be Column or list of Column`. How do I get around this, please? I cannot find any solution on Google related to it.

   Pages = sc.read.json("/Users/me/desktop/clickstream/Clicks/Page*.json.gz")

   Pages_Dataset = Pages.select("SessionNumber", "PageLocation", "PageInstanceID")\
        .withColumnRenamed("PageLocation", "URL")\
        .withColumnRenamed("PageInstanceID", "CsaNumber")\
        .withColumn("URL2", expr("CASE WHEN INSTR(URL, '=') > 0 THEN SUBSTR(URL,0,INSTR(URL, '=') -1) ELSE URL END"))\
        .withColumn("URL2", expr("CASE WHEN INSTR(URL2, '?') > 0 THEN SUBSTR(URL2,0,INSTR(URL2, '?') -1) ELSE URL2 END"))\
        .withColumn("URL2", expr("CASE WHEN INSTR(URL2, '#') > 0 THEN SUBSTR(URL2,0,INSTR(URL2, '#') -1) ELSE URL2 END"))\
        .withColumn("URL3", expr("CASE WHEN INSTR(URL, 'prdcls=') > 0 THEN SUBSTR(URL,INSTR(URL, 'prdcls=')+7,2) ELSE '' END"))\
        .withColumn("URL", concat("URL2", "URL3"))\
        .select("SessionNumber", "URL", "CsaNumber").alias("a")\
        .join(ConfiguredUrls.alias("b"), lower("a.URL") == lower("b.URL"), "left")\
        .select("a.SessionNumber", "b.Product", "a.CsaNumber", "b.EndQuote", "a.URL")\
        .withColumnRenamed("Product", "Session")\
        .withColumn("Session", expr("CASE WHEN lower(URL) like 'https://mobilephones.com/deals/%' THEN 'Mobile Phones' ELSE Session END"))\
        .withColumn("EndQuote", expr("CASE WHEN lower(URL) like 'https://mobilephones.com/deals/%' THEN 'Mobile Phones' ELSE EndQuote END"))\
        .distinct()

   Goals_Dataset = Goals.select("SessionNumber", "GoalName", "PageInstanceID", "EventTimestamp")\
        .withColumnRenamed("EventTimestamp", "GoalDate")\
        .withColumnRenamed("PageInstanceID", "CsaNumber")\
        .select("SessionNumber", "GoalName", "CsaNumber", "GoalDate").alias("a")\
        .join(ConfiguredGoals.alias("b"), lower("a.GoalName") == lower("b.GoalNameValue"), "left")\
        .select("a.SessionNumber", coalesce("b.StartQuote", "b.EndQuote", "b.Switch").alias("Session"), "a.CsaNumber", "b.EndQuote")\
        .distinct()

   Session_Dataset = Pages_Dataset.select("SessionNumber", "Session", "CsaNumber", "EndQuote").alias("a")\
        .join(Goals_Dataset.alias("b"), "a.SessionNumber" == "b.SessionNumber", "fullouter")\
        .select(coalesce("a.SessionNumber", "b.SessionNumber").alias("SessionNumber"),  coalesce("a.Session", "b.Session").alias("Session"), coalesce("a.CsaNumber", "b.CsaNumber").alias("CsaNumber"), coalesce("a.EndQuote", "b.EndQuote").alias("EndQuote"))\
        .distinct()
#Error:
    Session_Dataset = Pages_Dataset.select("SessionNumber", "Session", "CsaNumber", "EndQuote").alias("a")\
      File "/usr/local/Cellar/apache-spark/3.2.1/libexec/python/lib/pyspark.zip/pyspark/sql/dataframe.py", line 1343, in join
    AssertionError: on should be Column or list of Column


Solution 1:[1]

"a.SessionNumber" == "b.SessionNumber" should be col("a.SessionNumber") == col("b.SessionNumber"), or just "SessionNumber"

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 pltc