Flux.jl: Customizing optimizer

I'm trying to implement a gradient-free optimizer function to train convolutional neural networks with Julia using Flux.jl. The reference paper is this: https://arxiv.org/abs/2005.05955. This paper proposes RSO, a gradient-free optimization algorithm that updates a single weight at a time on a sampling basis. The pseudocode of this algorithm is depicted in the picture below.

optimizer_pseudocode

I'm using MNIST dataset.

function train(; kws...)
args = Args(; kws...) # collect options in a struct for convenience

# Select the compute device: GPU when CUDA is available and requested.
if CUDA.functional() && args.use_cuda
    @info "Training on CUDA GPU"
    # Fixed typo: was `CUDA.allwoscalar(false)` (UndefVarError at runtime).
    # Disallowing scalar indexing catches accidental slow element-wise GPU access.
    CUDA.allowscalar(false)
    device = gpu
else
    @info "Training on CPU"
    device = cpu
end

# Prepare datasets
x_train, x_test, y_train, y_test = getdata(args, device)

# Create DataLoaders (mini-batch iterators)
train_loader = DataLoader((x_train, y_train), batchsize=args.batchsize, shuffle=true)
test_loader = DataLoader((x_test, y_test), batchsize=args.batchsize)

# Construct model
model = build_model() |> device
ps = Flux.params(model) # model's trainable parameters

best_param = ps
if args.optimiser == "SGD"
    # Regular training step with SGD

elseif args.optimiser == "RSO"
    # Run RSO function and update ps in place with the optimized weights
    best_param .= RSO(x_train, y_train, args.RSOupdate, model, args.batchsize, device)
end

And the corresponding RSO function:

# NOTE(review): this is the question's draft — it is not runnable as written.
# Specific problems are flagged inline below; the working version is in Solution 1.
function RSO(X,L,C,model, batch_size, device)
"""
model = convolutional model structure
X = Input data
L = labels
C = Number of rounds to update parameters
W = Weight set of layers
Wd = Weight tensors of layer d that generates an activation
wid = weight tensor that generates an activation aᵢ
wj = a weight in wid
"""

# Normalize input data to have zero mean and unit standard deviation
# NOTE(review): `sum(X)` here should almost certainly be `mean(X)` —
# subtracting the sum does not produce zero mean.
X .= (X .- sum(X))./std(X)
train_loader = DataLoader((X, L), batchsize=batch_size, shuffle=true)

#println("model = $(typeof(model))")

std_prep = []
σ_d = Float64[]
# NOTE(review): D starts at 1 but is incremented once per layer, so after the
# loop D == number_of_layers + 1 (off-by-one used later as the starting depth).
D = 1
for layer in model
    D += 1
    Wd = Flux.params(layer)
    # Initialize the weights of the network with Gaussian distribution
    for id in Wd
        # NOTE(review): the shape (3,3,4,4) is hard-coded and will not match
        # arbitrary layers; the solution below uses size(id) instead.
        wj = convert(Array{Float32, 4}, rand(Normal(0, sqrt(2/length(id))), (3,3,4,4)))
        # NOTE(review): `id = wj` only rebinds the local variable — it does NOT
        # write into the model's parameter array. In-place `id .= wj` is needed.
        id = wj
        append!(std_prep, vec(wj))
    end
    # Compute std of all elements in the weight tensor Wd
    # NOTE(review): std_prep is never reset, so this is the std of ALL layers
    # seen so far, not of layer d alone as the comment (and the paper) intends.
    push!(σ_d, std(std_prep))
end

W = Flux.params(model)

# Weight update
for _ in 1:C
    d = D
    while d > 0
        for id in 1:length(W[d])
            # Randomly sample change in weights from Gaussian distribution
            # NOTE(review): lowercase `w` is undefined (presumably meant `W`).
            for j in 1:length(w[d][id])
                # Randomly sample mini-batch
                # NOTE(review): a Flux DataLoader is an iterator, not indexable —
                # collect the batches first (as done in Solution 1 below).
                (x, l) = train_loader[rand(1:length(train_loader))]
                
                # Sample a weight from normal distribution
                # NOTE(review): ΔWj is never initialized before being indexed.
                ΔWj[d][id][j] = rand(Normal(0, σ_d[d]), 1)

                # NOTE(review): `data_loader` and `F` are undefined here, and
                # `argmin` takes a collection, not three positional losses —
                # this is the part the question asks how to express in Flux.
                loss, acc = loss_and_accuracy(data_loader, model, device)
                W = argmin(F(x,l, W+ΔWj), F(x,l,W), F(x,l, W-ΔWj))
            end
        end
        d -= 1
    end
end

return W
end

The problem here is the second block of the RSO function. I'm trying to evaluate the loss with the change of a single weight in three scenarios, which are F(x, l, W+ΔW), F(x, l, W), and F(x, l, W-ΔW), and choose the weight set with minimum loss. But how do I do that using Flux.jl? The loss function I'm trying to use is logitcrossentropy(ŷ, y, agg=sum). In order to generate ŷ, we should use model(x), but changing a single weight parameter in Zygote.Params() form was already challenging...



Solution 1:[1]

Thanks @darsnack :)

I found your answer a bit late, so in the meantime I could figure out my own script that works. Mine is indeed a bit hardcoded but could you also give feedback on this?

function RSO(train_loader, test_loader, C,model, batch_size, device, args)
"""
Gradient-free RSO training loop (arXiv:2005.05955): perturb one weight at a
time, evaluate the loss at W+ΔWj, W, and W-ΔWj on a random mini-batch, and
keep the variant with the smallest loss.

model = convolutional model structure
C = Number of rounds to update parameters (epochs)
batch_size = size of the mini batch that will be used to calculate loss
device = CPU or GPU
"""

# Evaluate initial weight
test_loss, test_acc = loss_and_accuracy(test_loader, model, device)
println("Initial Weight:")
println("   test_loss = $test_loss, test_accuracy = $test_acc")

# Materialize the batches once so mini-batches can be sampled uniformly at
# random below (a DataLoader is an iterator and cannot be indexed directly).
random_batch = []
for (x, l) in train_loader
    push!(random_batch, (x,l))
end

# Initialize weights with He-style Gaussians and record, per layer, the std
# σ_d of that layer's weights — later used as the perturbation scale.
σ_d = Float64[]
D = 0
for layer in model
    D += 1
    Wd = Flux.params(layer)
    # Collect this layer's weights only (the original accumulated across ALL
    # layers, so σ_d[d] was the std of every weight seen so far — the paper
    # computes the std of the weight tensor of layer d).
    layer_weights = Float64[]
    for id in Wd
        # rand(Normal(...), size(id)) works for any parameter shape, replacing
        # the original per-type branches (which left wj undefined for any
        # other parameter type).
        wj = rand(Normal(0, sqrt(2/length(id))), size(id))
        # In-place assignment: `id = wj` only rebinds the loop variable and
        # never touches the model's parameters (fixed from the original).
        id .= wj
        append!(layer_weights, vec(wj))
    end
    # Compute std of all elements in the weight tensor Wd
    push!(σ_d, std(layer_weights))
end

# Weight update
for c in 1:C
    d = D
    # First update the weights of the layer closest to the labels 
    # and then sequentially move closer to the input
    while d > 0
        Wd = Flux.params(model[d])
        for id in Wd
            # Perturb each scalar weight of this parameter tensor in turn
            for j in 1:length(id)
                # Randomly sample mini-batch
                (x, y) = rand(random_batch)
                x, y = device(x), device(y)
                
                # Sample a perturbation from N(0, σ_d) for this single weight
                ΔWj = rand(Normal(0, σ_d[d]))

                # Evaluate the loss in three scenarios, mutating id[j] in place
                # and restoring it between evaluations.
                ## F(x,l, W+ΔWj)
                id[j] = id[j]+ΔWj
                ŷ = model(x)
                ls_pos = logitcrossentropy(ŷ, y, agg=sum) / size(x)[end]

                ## F(x,l,W)
                id[j] = id[j]-ΔWj
                ŷ = model(x)
                ls_org = logitcrossentropy(ŷ, y, agg=sum) / size(x)[end]

                ## F(x,l, W-ΔWj)
                id[j] = id[j]-ΔWj
                ŷ = model(x)
                ls_neg = logitcrossentropy(ŷ, y, agg=sum) / size(x)[end]

                # Check weight update that gives minimum loss
                min_loss = argmin([ls_org, ls_pos, ls_neg])

                # Keep the best variant. The weight currently sits at W-ΔWj,
                # so adjust relative to that.
                if min_loss == 1
                    id[j] = id[j] + ΔWj      # restore original W
                elseif min_loss == 2
                    id[j] = id[j] + 2*ΔWj    # move to W+ΔWj
                end                          # min_loss == 3: stay at W-ΔWj
    
            end
        end
        d -= 1
    end

    train_loss, train_acc = loss_and_accuracy(train_loader, model, device)
    test_loss, test_acc = loss_and_accuracy(test_loader, model, device)

    track!(args.tracker, test_acc)

    println("RSO Round=$c")
    println("   train_loss = $train_loss, train_accuracy = $train_acc")
    println("   test_loss = $test_loss, test_accuracy = $test_acc")

end

return Flux.params(model)
end

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 Heesoo Song