PythonException: 'pyarrow.lib.ArrowTypeError: Input object was not a NumPy array'

I am trying the code below, but when I display the returned dataframe I get the error mentioned above. I am confused about what needs to be done; I have tried many approaches, but nothing works.

For your information: dfs is a dataframe created from a table; gpd_poly is a geo dataframe created from a shapefile; wkt_col is the geometry column of dfs; apply_col is just an id column of dfs.

This code works fine when the second-to-last statement is return pd.DataFrame(gpd_joined.drop(columns = 'geometry')).

But I don't want to drop this geometry column. When I instead write return pd.DataFrame(gpd_joined), I get the error "PythonException: 'pyarrow.lib.ArrowTypeError: Input object was not a NumPy array'".

'''

def joinWithGeo(dfs, gpd_poly, wkt_col, apply_col, local_projection = 'EPSG:3857', get_nearest = False, keep_geometry = False):
  """Left-join the rows of a Spark DataFrame with a polygon GeoDataFrame, group by group.

  Each group of ``dfs`` (grouped on ``apply_col``) is converted to a GeoDataFrame
  whose geometry is parsed from the WKT strings in ``wkt_col`` (read as EPSG:4326),
  reprojected to ``local_projection`` and joined against ``gpd_poly`` with
  ``leftSjoin`` (a helper defined outside this function; presumably a left
  spatial join - confirm against its definition).

  Args:
    dfs: Spark DataFrame to enrich; its schema is the base of the output schema.
    gpd_poly: GeoDataFrame with a ``'geometry'`` column; every other column is
      appended to the output schema as a ``'string'`` field.
    wkt_col: Name of the column of ``dfs`` that holds WKT geometries.
    apply_col: Name of the column used to repartition and group ``dfs``.
    local_projection: CRS both sides are projected to before joining.
    get_nearest: If true, rows left unmatched by the join (null in the first
      polygon field) take the fields of the polygon at the smallest distance.
    keep_geometry: If true, the output keeps a ``'geometry'`` column holding the
      projected geometry (in ``local_projection``) as WKT text. Shapely objects
      cannot be converted by Arrow ("Input object was not a NumPy array"), so
      the geometry has to be serialised before it is handed back to Spark.
      The default (False) drops the column, as before.

  Returns:
    The Spark DataFrame produced by ``applyInPandas`` with the extended schema.
  """

  poly_fields = list(gpd_poly.columns.values)
  poly_fields.remove('geometry')
  # Rebuild the schema from JSON so the additions below do not touch dfs.schema
  return_schema = StructType.fromJson(dfs.schema.jsonValue())
  for field in poly_fields:
    return_schema = return_schema.add(field, 'string')
  if keep_geometry and 'geometry' not in return_schema.fieldNames():
    return_schema = return_schema.add('geometry', 'string')

  # Project the poly in the local projection
  gpd_poly_projected = gpd_poly.to_crs(local_projection)

  def nearestPolygon(point_to_match):
    # idxmin gives the label of the smallest distance without sorting everything
    polygon_index = gpd_poly_projected.distance(point_to_match).idxmin()
    return gpd_poly_projected.loc[polygon_index, poly_fields]

  def udfJoinGeo(pdf):

    # Get the input data to geopandas
    gpd_dfs = gpd.GeoDataFrame(
      pdf,
      crs = "EPSG:4326",
      geometry = gpd.GeoSeries.from_wkt(pdf[wkt_col])).to_crs(local_projection)

    # Left join with the data
    gpd_joined = leftSjoin(gpd_dfs, gpd_poly_projected)

    # Without any polygon field there is nothing to test or to fill in
    if get_nearest and poly_fields:
      test_field = poly_fields[0]
      missing_mask = gpd_joined[test_field].isnull()
      if missing_mask.any():
        # Explicit copy: writing through .loc on a filtered slice is chained assignment
        gpd_missing = gpd_joined[missing_mask].copy()
        # Join it with the closest
        gpd_missing.loc[:, poly_fields] = gpd_missing.geometry.apply(nearestPolygon)
        # Concat with the data not missed (matched rows first, then the filled ones)
        gpd_joined = pd.concat([gpd_joined[~missing_mask], gpd_missing])

    result = pd.DataFrame(gpd_joined.drop(columns = 'geometry'))
    if keep_geometry:
      # Arrow cannot serialise shapely geometries; hand them back as WKT text
      result['geometry'] = [None if geom is None else geom.wkt for geom in gpd_joined['geometry']]
    return result
  return dfs.repartition(apply_col).groupby(col(apply_col)).applyInPandas(udfJoinGeo, return_schema)

'''



Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source