function [model, loss, err]=train_multi(data, labels, num_hidden, maxiter, eta, momentum, seed, W, V)
%TRAIN_MULTI Train a one-hidden-layer sigmoid network with batch gradient descent.
%   [model, loss, err] = train_multi(data, labels, num_hidden, maxiter, eta, momentum, seed)
%   [model, loss, err] = train_multi(..., seed, W, V)
%
%   Inputs:
%     data       N x d matrix, one example per row.
%     labels     N x 1 targets (a row vector is reshaped to a column). They are
%                compared against the output thresholded at 0.5, so 0/1 values
%                are expected.
%     num_hidden number of hidden units.
%     maxiter    number of full-batch update steps.
%     eta        learning rate.
%     momentum   blend factor: step = (1-momentum)*eta*grad + momentum*previous step.
%     seed       seed for the random weight initialisation; omit or pass [] to
%                leave the generators untouched.
%     W          optional (d+1) x num_hidden initial input-to-hidden weights.
%     V          optional (num_hidden+1) x 1 initial hidden-to-output weights.
%                The last row of W and V multiplies a constant 1 (bias).
%
%   Outputs:
%     model      struct with fields W and V holding the final weights.
%     loss       1 x maxiter vector; loss(k) is the squared error
%                norm(labels - output)^2 measured at the start of step k.
%     err        number of examples misclassified by the returned model.

% The weights are drawn with randn, which keeps its own state under the legacy
% seeding syntax, so both generators have to be seeded for a reproducible start.
if(nargin>=7 && ~isempty(seed))
	rand('twister', seed);
	randn('state', seed);
end

num_data=size(data,1);
num_dims=size(data,2);

% Force a column so a 1 x N labels vector cannot broadcast against the N x 1 output.
labels=labels(:);

% Each weight matrix is initialised independently, so W may be given without V.
if(nargin<8 || isempty(W))
	W=randn(num_dims+1, num_hidden);
end
if(nargin<9 || isempty(V))
	V=randn(num_hidden+1, 1);
end

dWprev=0; dVprev=0;

bias=ones(num_data,1);
data=[data bias];

% numel(1:maxiter) matches the loop's trip count, including maxiter <= 0.
loss=zeros(1, numel(1:maxiter));
for iter=1:maxiter
	%compute outputs
	layer1y=sigmoid(data*W);
	hidden=[layer1y bias];
	layer2y=sigmoid(hidden*V);

	%loss of the weights used in this forward pass (before the update)
	loss(iter)=norm(labels-layer2y)^2;
	%plot(loss); xlim([1 maxiter]); ylim([0 20]);drawnow;

	%layer2 learn: error times the sigmoid derivative y.*(1-y)
	delta=(labels-layer2y).*layer2y.*(1-layer2y);
	dV=hidden'*delta;

	%layer1 learn: back-propagate through V, skipping its bias row
	delta1=(delta*V(1:end-1)').*layer1y.*(1-layer1y);
	dW=data'*delta1;

	%update: blend the scaled gradient with the previous step
	dV=(1-momentum)*eta*dV+momentum*dVprev;
	dW=(1-momentum)*eta*dW+momentum*dWprev;
	V=V+dV;
	W=W+dW;

	dVprev=dV;
	dWprev=dW;
end
model.W=W;
model.V=V;

% Re-run the forward pass with the final weights so err describes the returned
% model (and is defined even when no iterations were run).
layer2y=sigmoid([sigmoid(data*W) bias]*V);
pred=layer2y>0.5;
err=sum(labels~=pred);

function p=sigmoid(u)
%SIGMOID Element-wise logistic function.
%   p = sigmoid(u) returns 1./(1+exp(-u)) for every entry of u; p has the
%   same size as u.
decay=exp(-u);
p=1./(1+decay);
