function [model, loss,err]=train_multi(data, labels, maxiter, eta, momentum, seed, W)
%TRAIN_MULTI Fit a single sigmoid unit by batch gradient descent on squared error.
%
%   [model, loss, err] = train_multi(data, labels, maxiter, eta, momentum, seed, W)
%
%   Inputs
%     data     - N x d matrix, one example per row. A constant column of
%                ones is appended internally as the bias input.
%     labels   - N x 1 target vector (presumably 0/1, since predictions are
%                obtained by thresholding the sigmoid output at 0.5).
%     maxiter  - number of full-batch update steps.
%     eta      - learning rate.
%     momentum - smoothing factor in [0,1): each step is
%                (1-momentum)*eta*gradient + momentum*previous_step,
%                so momentum=0 is plain gradient descent.
%     seed     - seed used for the random number generators.
%     W        - optional (d+1) x 1 initial weights (bias weight last).
%                When omitted or empty, weights are drawn from randn.
%
%   Outputs
%     model - struct with field W, the final (d+1) x 1 weight vector.
%     loss  - 1 x maxiter vector; loss(k) is the squared error measured at
%             the start of iteration k (before that iteration's update).
%     err   - number of training examples misclassified by the final model.

rand('twister', seed);
num_data=size(data,1);
num_dims=size(data,2);

if(~exist('W', 'var') || isempty(W))
	% rand and randn keep separate legacy generator states, so seeding rand
	% alone does not make the randn-based initialisation reproducible.
	randn('state', seed);
	W=randn(num_dims+1, 1);
end

dWprev=0;
% Preallocate so the output exists (as 1 x 0) even when maxiter is 0.
loss=zeros(1, maxiter);

% Append the bias input.
data=[data ones(num_data,1)];
for iter=1:maxiter
	% Forward pass.
	layer1y=sigmoid(data*W);
	residual=labels-layer1y;
	loss(iter)=norm(residual)^2;

	% Descent direction of the squared error through the sigmoid.
	delta=residual.*layer1y.*(1-layer1y);
	grad=data'*delta;

	% Exponentially smoothed step: momentum weights the previous step,
	% (1-momentum) weights the fresh gradient.
	dW=(1-momentum)*eta*grad+momentum*dWprev;
	W=W+dW;
	dWprev=dW;
end
model.W=W;

% Evaluate the error with the final weights so it describes the returned model.
pred=sigmoid(data*W)>0.5;
err=sum(labels~=pred);


function p=sigmoid(u)
%SIGMOID Element-wise logistic function, 1/(1+e^(-u)).
%   The output has the same size as u.
decay = exp(-u);
p = 1 ./ (decay + 1);
