Custom ML Function in PySpark

I have a Python class that fits a model based on `stats.gamma.cdf`. I want to convert this into a pandas UDF so that it can be used in PySpark on a larger dataset. Any insights would be really helpful.

import scipy

import scipy.stats as stats 

class gamma_function:
    """Scaled gamma-CDF curve with a small fit/predict/transform interface.

    The modelled curve is ``stats.gamma.cdf(X, a, b) * p``.  Note that ``a``
    and ``b`` are passed positionally to ``stats.gamma.cdf``; per the SciPy
    signature ``cdf(x, a, loc=0, scale=1)`` that makes ``a`` the shape
    parameter and ``b`` the ``loc`` (shift), not the scale.

    Attributes are ``None`` until supplied to the constructor, set through
    ``set_params``, or estimated by ``fit``.
    """

    def __init__(self, a=None, b=None, p=None):
        """Store the three curve parameters (all optional)."""
        self.a = a
        self.b = b
        self.p = p

    def _gamma_cdf_background(self, X, a, b, p):
        """Evaluate the scaled gamma CDF for explicit parameter values.

        Kept separate from ``predict`` because ``curve_fit`` needs a callable
        that takes the candidate parameters as arguments.
        """
        return stats.gamma.cdf(X, a, b) * p

    def predict(self, X):
        """Evaluate the curve at ``X`` using the stored ``a``, ``b``, ``p``."""
        return self._gamma_cdf_background(X, self.a, self.b, self.p)

    def fit(self, X, y):
        """Estimate ``a``, ``b`` and ``p`` from data by bounded least squares.

        Parameters are constrained to ``a`` in [0.25, 3], ``b`` in [0.1, 3]
        and ``p`` in [0.1, 3].

        Returns:
            ``self``, so calls can be chained.
        """
        from scipy.optimize import curve_fit

        # The covariance estimate returned by curve_fit is not used.
        popt, _ = curve_fit(
            self._gamma_cdf_background,
            X,
            y,
            bounds=([.25, 0.1, .1], [3., 3., 3.]),
            tr_solver='lsmr',
        )
        self.a = popt[0]
        self.b = popt[1]
        self.p = popt[2]
        return self

    def transform(self, X):
        """Evaluate the curve at ``X``; same computation as ``predict``."""
        return self.predict(X)

    def get_params(self, deep=False):
        """Return the current parameters as a dict with keys 'a', 'b', 'p'.

        ``deep`` is accepted but unused (presumably for scikit-learn-style
        API compatibility -- confirm against callers).
        """
        return {'a': self.a, 'b': self.b, 'p': self.p}

    def set_params(self, **parameters):
        """Set attributes from keyword arguments and return ``self``."""
        # Fixed: the original called ``parameters.intems()``, which raised
        # AttributeError on every call.
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self


Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source