This page lists ready-to-run Shogun examples for the static Matlab(tm) and Octave interface.
To run the examples, issue
octave name_of_example.m
or start up matlab or octave and then type
name_of_example
Note that sg.oct or sg.mexglx (the name varies with architecture) has to be in the Matlab/Octave path. This can be achieved using the addpath command:
addpath /path/to/octave
respectively
addpath /path/to/matlab
Finally, note that for non-root installations you will have to make sure that libshogun and libshogunui can be found by the dynamic linker, e.g. you will need to set:
LD_LIBRARY_PATH=path/to/libshogun:path/to/libshogunui
before starting matlab.
% GMNPSVM example: trains on multiclass labels with a Gaussian kernel and
% classifies the test features through the static sg() interface.

% Kernel and solver parameters.
width=2.1;
size_cache=10;
epsilon=1e-5;
C=1.2;
use_bias=false;
max_train_time=60;   % defined here but not passed to sg() in this example

% Make the helper directory available and read the toy data.
addpath('tools');
label_train_multiclass=load_matrix('../data/label_train_multiclass.dat');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

disp('GMNPSVM');

% Classifier, kernel, training data and solver settings.
sg('new_classifier', 'GMNPSVM');
sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width);
sg('set_features', 'TRAIN', fm_train_real);
sg('set_labels', 'TRAIN', label_train_multiclass);
sg('svm_epsilon', epsilon);
sg('svm_use_bias', use_bias);
sg('c', C);

% Train, then classify the test features.
sg('train_classifier');
sg('set_features', 'TEST', fm_test_real);
result=sg('classify');
% GPBTSVM example: two-class training with a Gaussian kernel, followed by
% classification of the test features.

% Kernel and solver parameters.
width=2.1;
size_cache=10;
epsilon=1e-5;
C=1.2;
use_bias=false;
max_train_time=60;   % defined here but not passed to sg() in this example

% Make the helper directory available and read the toy data.
addpath('tools');
label_train_twoclass=load_matrix('../data/label_train_twoclass.dat');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

disp('GPBTSVM');

% Kernel and training data are registered before the classifier is created.
sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width);
sg('set_features', 'TRAIN', fm_train_real);
sg('set_labels', 'TRAIN', label_train_twoclass);
sg('new_classifier', 'GPBTSVM');
sg('svm_epsilon', epsilon);
sg('svm_use_bias', use_bias);
sg('c', C);

% Train, then classify the test features.
sg('train_classifier');
sg('set_features', 'TEST', fm_test_real);
result=sg('classify');
% KNN example: Euclidean distance on real-valued features, two-class labels.

% Make the helper directory available and read the toy data.
addpath('tools');
label_train_twoclass=load_matrix('../data/label_train_twoclass.dat');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

disp('KNN');

% Distance and training data.
sg('set_distance', 'EUCLIDIAN', 'REAL');
sg('set_features', 'TRAIN', fm_train_real);
sg('set_labels', 'TRAIN', label_train_twoclass);

% Create the classifier; the value 3 is handed to 'train_classifier'
% (presumably the number of neighbours k -- confirm against the sg docs).
sg('new_classifier', 'KNN');
sg('train_classifier', 3);

% Classify the test features.
sg('set_features', 'TEST', fm_test_real);
result=sg('classify');
% LDA example: trains on real-valued two-class data and classifies the test set.

% Make the helper directory available and read the toy data.
addpath('tools');
label_train_twoclass=load_matrix('../data/label_train_twoclass.dat');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

disp('LDA');

% Training data, classifier creation and training.
sg('set_features', 'TRAIN', fm_train_real);
sg('set_labels', 'TRAIN', label_train_twoclass);
sg('new_classifier', 'LDA');
sg('train_classifier');

% Classify the test features.
sg('set_features', 'TEST', fm_test_real);
result=sg('classify');
% LibLinear example ('LIBLINEAR_LR'): linear classifier on sparse features.

% Solver parameters.
epsilon=1e-5;
max_train_time=60;
use_bias=false;
C=1;

% Make the helper directory available and read the toy data.
addpath('tools');
label_train_twoclass=load_matrix('../data/label_train_twoclass.dat');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

disp('LibLinear');

% The features are converted with sparse() before being handed to sg().
sg('new_classifier', 'LIBLINEAR_LR');
sg('set_features', 'TRAIN', sparse(fm_train_real));
sg('set_labels', 'TRAIN', label_train_twoclass);
sg('svm_epsilon', epsilon);
sg('svm_use_bias', use_bias);
sg('svm_max_train_time', max_train_time);
sg('c', C);

% Train, then classify the (sparse) test features.
sg('train_classifier');
sg('set_features', 'TEST', sparse(fm_test_real));
result=sg('classify');
% LibLinear L2 example on synthetic data: trains 'LIBLINEAR_L2', reads back
% the linear classifier (b, W), reports the training error and an objective
% value computed from W and b.

C=0.1;
epsilon=1e-3;

% Seed both generators. The toy data below is drawn with randn(), whose
% legacy state is separate from rand()'s, so seeding rand() alone does not
% make the data reproducible.
rand('state',17);
randn('state',17);

% Two classes of num/2 points each, shifted by -dist / +dist per dimension.
num=1000;
dim=20;
dist=1;
traindat=sparse([randn(dim,num/2)-dist, randn(dim,num/2)+dist]);
trainlab=[-ones(1,num/2), ones(1,num/2) ];

% Training data and solver settings.
sg('set_features', 'TRAIN', traindat);
sg('set_labels', 'TRAIN', trainlab);
sg('c', C);
sg('svm_use_bias', false);
sg('svm_epsilon', epsilon);
sg('new_classifier', 'LIBLINEAR_L2');

% Timed training (the elapsed time is echoed on purpose).
tic;
sg('train_classifier');
timeliblinear=toc

% Retrieve the classifier and evaluate it on the training data itself.
[b,W]=sg('get_classifier');
sg('set_features', 'TEST', traindat);
trainout=sg('classify');
trainerr=mean(trainlab~=sign(trainout))

% Echo bias, weights and the objective value.
b
W'
obj=sum(W.^2)+C*sum((1-trainlab.*(W'*traindat+b)).^2)
% LibSVM example: two-class SVM with a Gaussian kernel.

% Kernel and solver parameters.
width=2.1;
size_cache=10;
epsilon=1e-5;
C=1;
use_bias=false;

% Make the helper directory available and read the toy data.
addpath('tools');
label_train_twoclass=load_matrix('../data/label_train_twoclass.dat');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

disp('LibSVM');

% Kernel, training data, classifier and solver settings.
sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width);
sg('set_features', 'TRAIN', fm_train_real);
sg('set_labels', 'TRAIN', label_train_twoclass);
sg('new_classifier', 'LIBSVM');
sg('svm_epsilon', epsilon);
sg('svm_use_bias', use_bias);
sg('c', C);

% Train, then classify the test features.
sg('train_classifier');
sg('set_features', 'TEST', fm_test_real);
result=sg('classify');
% LibSVM multiclass example: Gaussian kernel, multiclass training labels.

% Kernel and solver parameters.
width=2.1;
size_cache=10;
epsilon=1e-5;
C=1.2;
use_bias=false;

% Make the helper directory available and read the toy data.
addpath('tools');
label_train_multiclass=load_matrix('../data/label_train_multiclass.dat');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

disp('LibSVMMultiClass');

% Kernel, training data, classifier and solver settings.
sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width);
sg('set_features', 'TRAIN', fm_train_real);
sg('set_labels', 'TRAIN', label_train_multiclass);
sg('new_classifier', 'LIBSVM_MULTICLASS');
sg('svm_epsilon', epsilon);
sg('svm_use_bias', use_bias);
sg('c', C);

% Train, then classify the test features.
sg('train_classifier');
sg('set_features', 'TEST', fm_test_real);
result=sg('classify');
% LibSVM one-class example: Gaussian kernel; no training labels are set.

% Kernel and solver parameters.
width=2.1;
size_cache=10;
epsilon=1e-5;
C=1.2;
use_bias=false;

% Make the helper directory available and read the toy data.
addpath('tools');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

disp('LibSVMOneClass');

% Kernel, training features, classifier and solver settings.
sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width);
sg('set_features', 'TRAIN', fm_train_real);
sg('new_classifier', 'LIBSVM_ONECLASS');
sg('svm_epsilon', epsilon);
sg('svm_use_bias', use_bias);
sg('c', C);

% Train, then classify the test features.
sg('train_classifier');
sg('set_features', 'TEST', fm_test_real);
result=sg('classify');
% LPBoost example on synthetic sparse data: trains 'LPBOOST', reads back
% (b, W), and echoes timing, training error, bias, weights and an
% objective value computed from W and b.

epsilon=1e-3;
C=100;

% Seeded uniform toy data: two blocks of num/2 columns, offset by -4*dist
% and -dist respectively.
rand('state',17);
num=1000;
dim=20;
dist=1;
traindat=sparse([rand(dim,num/2)-4*dist, rand(dim,num/2)-dist]);
trainlab=[-ones(1,num/2), ones(1,num/2) ];

% Training data and solver settings.
sg('set_features', 'TRAIN', traindat);
sg('set_labels', 'TRAIN', trainlab);
sg('c', C);
sg('svm_use_bias', false);
sg('svm_epsilon', epsilon);
sg('new_classifier', 'LPBOOST');

% Timed training (the elapsed time is echoed).
tic;
sg('train_classifier');
timelpboost=toc

% Retrieve the classifier and evaluate it on the training data.
[b,W]=sg('get_classifier');
sg('set_features', 'TEST', traindat);
trainout=sg('classify');
trainerr=mean(trainlab~=sign(trainout))

% Echo bias, weights and the objective value.
b
W'
obj=sum(abs(W))+C*sum(max(0,1-trainlab.*(W'*traindat+b)))
% LPM example on synthetic sparse data: trains 'LPM' with bias enabled,
% reads back (b, W), and echoes timing, training error, bias, weights and
% an objective value computed from W and b.

epsilon=1e-3;   % defined here but not passed to sg() in this example
C=100;

% Seeded uniform toy data: two blocks of num/2 columns, offset by -4*dist
% and -dist respectively.
rand('state',17);
num=1000;
dim=20;
dist=1;
traindat=sparse([rand(dim,num/2)-4*dist, rand(dim,num/2)-dist]);
trainlab=[-ones(1,num/2), ones(1,num/2) ];

% Training data and solver settings.
sg('set_features', 'TRAIN', traindat);
sg('set_labels', 'TRAIN', trainlab);
sg('c', C);
sg('svm_use_bias', true);
sg('new_classifier', 'LPM');

% Timed training (the elapsed time is echoed).
tic;
sg('train_classifier');
timelpm=toc

% Retrieve the classifier and evaluate it on the training data.
[b,W]=sg('get_classifier');
sg('set_features', 'TEST', traindat);
trainout=sg('classify');
trainerr=mean(trainlab~=sign(trainout))

% Echo bias, weights and the objective value.
b
W'
obj=sum(abs(W))+C*sum(max(0,1-trainlab.*(W'*traindat+b)))
% MPDSVM example: two-class SVM with a Gaussian kernel.

% Kernel and solver parameters.
width=2.1;
size_cache=10;
epsilon=1e-5;
C=1.2;
use_bias=false;

% Make the helper directory available and read the toy data.
addpath('tools');
label_train_twoclass=load_matrix('../data/label_train_twoclass.dat');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

disp('MPDSVM');

% Kernel, training data, classifier and solver settings.
sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width);
sg('set_features', 'TRAIN', fm_train_real);
sg('set_labels', 'TRAIN', label_train_twoclass);
sg('new_classifier', 'MPDSVM');
sg('svm_epsilon', epsilon);
sg('svm_use_bias', use_bias);
sg('c', C);

% Train, then classify the test features.
sg('train_classifier');
sg('set_features', 'TEST', fm_test_real);
result=sg('classify');
% Perceptron example on generated two-class toy data.
addpath('tools');

disp('Perceptron');

% Create some separable toy data: 5-dimensional Gaussian samples shifted
% by -1 for the first half and +1 for the second half.
num=50;
label_train_twoclass=[-ones(1,num/2) ones(1,num/2)];
fm_train_real=[randn(5,num/2)-1, randn(5,num/2)+1];
fm_test_real=[randn(5,num)-1, randn(5,num)+1];

% Training data and classifier.
sg('set_features', 'TRAIN', fm_train_real);
sg('set_labels', 'TRAIN', label_train_twoclass);
sg('new_classifier', 'PERCEPTRON');
% Optional parameter call, left disabled:
%sg('set_perceptron_parameters', 1.6, 5000);
sg('train_classifier');

% Classify the test features.
sg('set_features', 'TEST', fm_test_real);
result=sg('classify');
% SubGradientSVM example: only the setup is executed; training and
% classification are left commented out.

% Solver parameters (C is reassigned to 0.9 further down).
C=1.2;
epsilon=1e-5;
use_bias=false;
max_train_time=60;
width=2.1;           % defined here but not passed to sg() in this example

% Make the helper directory available and read the toy data.
addpath('tools');
label_train_twoclass=load_matrix('../data/label_train_twoclass.dat');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

% SubgradientSVM - often does not converge
disp('SubGradientSVM');

C=0.9;

% Sparse training features, labels, classifier and solver settings.
sg('set_features', 'TRAIN', sparse(fm_train_real));
sg('set_labels', 'TRAIN', label_train_twoclass);
sg('new_classifier', 'SUBGRADIENTSVM');
sg('svm_epsilon', epsilon);
sg('svm_use_bias', use_bias);
sg('svm_max_train_time', max_train_time);
sg('c', C);

% sometimes does not terminate
%sg('train_classifier');

%sg('set_features', 'TEST', sparse(fm_test_real));
%result=sg('classify');
% SVMLight example: two-class SVM on DNA strings with a weighted-degree
% kernel. The sg() calls are wrapped in try/catch so that the script prints
% a notice instead of failing when the calls raise an error.

% Kernel cache size handed to 'set_kernel'. It must be defined before the
% try block: an undefined variable there would raise an error that the
% catch branch reports as missing SVMLight support.
size_cache=10;

% Solver parameters.
C=1.2;
use_bias=false;
epsilon=1e-5;

% Make the helper directory available and read the DNA toy data.
addpath('tools');
label_train_dna=load_matrix('../data/label_train_dna.dat');
fm_train_dna=load_matrix('../data/fm_train_dna.dat');
fm_test_dna=load_matrix('../data/fm_test_dna.dat');

% SVMLight
try
	disp('SVMLight');

	% Weighted-degree kernel of degree 20 on character data.
	degree=20;
	sg('set_kernel', 'WEIGHTEDDEGREE', 'CHAR', size_cache, degree);
	sg('set_features', 'TRAIN', fm_train_dna, 'DNA');
	sg('set_labels', 'TRAIN', label_train_dna);
	sg('new_classifier', 'SVMLIGHT');
	sg('svm_epsilon', epsilon);
	sg('svm_use_bias', use_bias);
	sg('c', C);

	% Train, then classify the test strings.
	sg('train_classifier');
	sg('set_features', 'TEST', fm_test_dna, 'DNA');
	result=sg('classify');
catch
	disp('No support for SVMLight available.')
end
% SVMLight with a weighted-degree kernel on generated DNA strings.
% The trained classifier is evaluated three times under different
% batch-computation / kernel-optimization settings, and the maximum
% absolute differences between the outputs are echoed at the end.
rand('seed',17);

%sequence lengths, number of sequences
len=200;
num_train=500;
num_test=500;
num_a=2;
% Rows (sequence positions) that are overwritten with 'A' for positive examples.
aa=(round(len/2-num_a/2)):(round(len/2+num_a/2-1));

%SVM regularization factor C
C=1;

%Weighted Degree kernel parameters
max_order=5;
order=15
max_mismatch=0;
cache=100;
normalize=true;
mkl_stepsize=1;
block=0;
single_degree=-1;

%generate some toy data
acgt='ACGT';
rand('state',1);

% Training strings: one column per sequence; the positive half gets the motif.
traindat=acgt(ceil(4*rand(len,num_train)));
trainlab=[-ones(1,num_train/2),ones(1,num_train/2)];
traindat(aa,trainlab==1)='A';

% Test strings, generated the same way.
testdat=acgt(ceil(4*rand(len,num_test)));
testlab=[-ones(1,num_test/2),ones(1,num_test/2)];
testdat(aa,testlab==1)='A';

%traindat'
%input('key to continue')

%train svm
sg('use_linadd', true);
sg('use_batch_computation', false);
sg('set_features', 'TRAIN', traindat, 'DNA');
sg('set_labels', 'TRAIN', trainlab);
sg('set_kernel', 'WEIGHTEDDEGREE', 'CHAR', cache, order, max_mismatch, normalize, mkl_stepsize, block, single_degree);
%sg('set_WD_position_weights', ones(1,100)/100) ;
%sg('set_WD_position_weights', ones(1,200)/200) ;
sg('new_classifier', 'SVMLIGHT');
sg('c',C);
tic;sg('train_classifier');toc;

%evaluate svm on test data
sg('set_features', 'TEST', testdat, 'DNA');
sg('set_labels', 'TEST', testlab);

% Run 1: batch computation on, kernel optimization deleted first.
%sg('init_kernel_optimization');
%sg('delete_kernel_optimization');
sg('use_batch_computation', true);
sg('delete_kernel_optimization');
out1=sg('classify');
fprintf('accuracy: %f                                                                                         \n', mean(sign(out1)==testlab))

% Run 2: batch computation on.
sg('use_batch_computation', true);
out2=sg('classify');
fprintf('accuracy: %f                                                                                         \n', mean(sign(out2)==testlab))

% Run 3: batch computation off, kernel optimization initialised explicitly.
sg('use_batch_computation', false);
tic;sg('init_kernel_optimization');toc;
%sg('delete_kernel_optimization');

tic;out3=sg('classify');toc;
fprintf('accuracy: %f                                                                                         \n', mean(sign(out3)==testlab))

% Largest deviation of runs 2 and 3 from run 1 (echoed).
max(abs(out1-out2))
max(abs(out1-out3))
% SVMLight with a shifted weighted-degree kernel ('WEIGHTEDDEGREEPOS3') on
% generated DNA strings. The classifier output is computed five times under
% different kernel-optimization / batch-computation settings and the
% maximum absolute deviations from the first run are echoed.
rand('seed',17);

%sequence lengths, number of sequences
len=100;
num_train=200;
num_test=300;
num_a=3;
% Base rows of the 'A' motif; a per-sequence offset is added below.
aa=(round(len/2-num_a/2)):(round(len/2+num_a/2-1));

%SVM regularization factor C
C=1;

%Weighted Degree kernel parameters
max_order=8;
order=20;
shift=10 ;
max_mismatch=0;
cache=100;
single_degree=-1;

% Random per-position values, reversed and floored to build the shifts.
x=shift*rand(1,len);
%x(:)=0;
shifts = int32(floor(x(end:-1:1)));

% suboptimal position weights:
posweights = double(floor(x(end:-1:1)));

%generate some toy data
acgt='ACGT';
rand('state',1);

% Training strings; each positive sequence gets the motif at a random offset.
traindat=acgt(ceil(4*rand(len,num_train)));
trainlab=[-ones(1,num_train/2),ones(1,num_train/2)];
aas=floor((shift+1)*rand(num_train,1));
idx=find(trainlab==1);
for i=1:length(idx),
	traindat(aa+aas(i),idx(i))='A';
end

% Test strings, generated the same way.
testdat=acgt(ceil(4*rand(len,num_test)));
testlab=[-ones(1,num_test/2),ones(1,num_test/2)];
aas=floor((shift+1)*rand(num_test,1));
idx=find(testlab==1);
for i=1:length(idx),
	testdat(aa+aas(i),idx(i))='A';
end

%traindat=traindat(1:5,:) ;
%testdat=testdat(1:5,:) ;
%len=5 ;
traindat(end,end)='A' ;

%traindat'
%input('key to continue')

%train svm
sg('use_linadd', true);
sg('use_batch_computation', true);
sg('set_features', 'TRAIN', traindat,'DNA');
sg('set_labels', 'TRAIN', trainlab);
%sg('set_kernel', 'WEIGHTEDDEGREEPOS2', 'CHAR', 10, order, max_mismatch, len, shifts);
sg('set_kernel', 'WEIGHTEDDEGREEPOS3', 'CHAR', 10, order, max_mismatch, len, 1, shifts);
%sg('set_kernel', 'WEIGHTEDDEGREEPOS3', 'CHAR', 10, order, max_mismatch, len, 1, shifts, posweights);
%sg('set_kernel', 'WEIGHTEDDEGREE', 'CHAR', cache, order, max_mismatch, normalize, mkl_stepsize, block, single_degree);
%sg('set_kernel', 'WEIGHTEDDEGREE', 'CHAR', cache, order);
%sg('set_WD_position_weights', ones(1,100)/100) ;
sg('new_classifier', 'SVMLIGHT');
sg('c',C);
sg('train_classifier');

%w=sg('get_subkernel_weights') ;
%w(1:3)=1 ;
%w(2:3)=0 ;
%w(3)=1 ;
%sg('set_subkernel_weights',w) ;

%z=cell(); z{10}='';
%for i=1:10;
%	z{i}=traindat(:,i)';
%end
%sg('set_features', 'TEST', z,'DNA');

sg('set_features', 'TEST', testdat,'DNA');
sg('set_labels', 'TEST', testlab);

% Run 1: no batch computation, kernel optimization deleted.
sg('use_batch_computation', false);
sg('delete_kernel_optimization');
out1=sg('classify');
fprintf('accuracy: %f                                                                                         \n', mean(sign(out1)==testlab))

% Run 2: 'SLOWBUTMEMEFFICIENT' with batch computation; the classifier is
% trained again before classifying.
sg('set_kernel_optimization_type', 'SLOWBUTMEMEFFICIENT');
sg('use_batch_computation', true);
sg('delete_kernel_optimization');
sg('train_classifier')
out2=sg('classify');
fprintf('accuracy: %f                                                                                         \n', mean(sign(out2)==testlab))

% Run 3: 'FASTBUTMEMHUNGRY' with batch computation.
sg('set_kernel_optimization_type', 'FASTBUTMEMHUNGRY');
sg('use_batch_computation', true);
sg('delete_kernel_optimization');
out3=sg('classify');
fprintf('accuracy: %f                                                                                         \n', mean(sign(out3)==testlab))

% Run 4: 'SLOWBUTMEMEFFICIENT', explicit kernel-optimization init, no batch.
sg('set_kernel_optimization_type', 'SLOWBUTMEMEFFICIENT');
%sg('set_kernel_optimization_type', 'FASTBUTMEMHUNGRY');
sg('use_batch_computation', false);
tic;sg('init_kernel_optimization');toc;
%sg('delete_kernel_optimization');

tic;out4=sg('classify');toc;
fprintf('accuracy: %f                                                                                         \n', mean(sign(out4)==testlab))

% Run 5: 'FASTBUTMEMHUNGRY', explicit kernel-optimization init, no batch.
sg('set_kernel_optimization_type', 'FASTBUTMEMHUNGRY');
sg('use_batch_computation', false);
tic;sg('init_kernel_optimization');toc;
%sg('delete_kernel_optimization');

tic;out5=sg('classify');toc;
fprintf('accuracy: %f                                                                                         \n', mean(sign(out5)==testlab))

% Largest deviation of each later run from run 1 (echoed).
max(abs(out1-out2))
max(abs(out1-out3))
max(abs(out1-out4))
max(abs(out1-out5))
%max(abs(out2-out3))
%xmax(abs(out3-out4))

% The script stops here; the statements below are never executed.
return

%evaluate svm on train data
sg('set_features', 'TEST', traindat,'DNA');
sg('set_labels', 'TEST', trainlab);
out=sg('classify');
fprintf('accuracy: %f                                                                                         \n', mean(sign(out)==trainlab))

%evaluate svm on test data
sg('set_features', 'TEST', testdat,'DNA');
sg('set_labels', 'TEST', testlab);
out=sg('classify');
fprintf('accuracy: %f                                                                                         \n', mean(sign(out)==testlab))
% SVMLin example: linear classifier on sparse real-valued features.

% Solver parameters.
epsilon=1e-5;
max_train_time=60;
use_bias=false;
C=1.2;

% Make the helper directory available and read the toy data.
addpath('tools');
label_train_twoclass=load_matrix('../data/label_train_twoclass.dat');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

disp('SVMLin');

% Sparse training features, labels, classifier and solver settings.
sg('set_features', 'TRAIN', sparse(fm_train_real));
sg('set_labels', 'TRAIN', label_train_twoclass);
sg('new_classifier', 'SVMLIN');
sg('svm_epsilon', epsilon);
sg('svm_use_bias', use_bias);
sg('svm_max_train_time', max_train_time);
sg('c', C);

% Train, then classify the (sparse) test features.
sg('train_classifier');
sg('set_features', 'TEST', sparse(fm_test_real));
result=sg('classify');
% SVMOcas example: linear classifier on sparse real-valued features.

% Solver parameters.
epsilon=1e-5;
max_train_time=60;
use_bias=false;
C=1.2;

% Make the helper directory available and read the toy data.
addpath('tools');
label_train_twoclass=load_matrix('../data/label_train_twoclass.dat');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

disp('SVMOcas');

% Here the classifier is created before the training data is registered.
sg('new_classifier', 'SVMOCAS');
sg('set_features', 'TRAIN', sparse(fm_train_real));
sg('set_labels', 'TRAIN', label_train_twoclass);
sg('svm_epsilon', epsilon);
sg('svm_use_bias', use_bias);
sg('svm_max_train_time', max_train_time);
sg('c', C);

% Train, then classify the (sparse) test features.
sg('train_classifier');
sg('set_features', 'TEST', sparse(fm_test_real));
result=sg('classify');
% SVMOcas on a small synthetic sparse data set. After training, (b, W) is
% read back, installed into a fresh SVMOCAS classifier through
% 'set_linear_classifier', and both classifiers' outputs are compared.

epsilon=1e-3;
C=10;

% Seeded uniform toy data, rescaled by dim times the overall mean.
rand('state',17);
num=16;
dim=10;
dist=0.001;
traindat=[rand(dim,num/2)-dist, rand(dim,num/2)+dist];
scale=(dim*mean(traindat(:)));
traindat=sparse(traindat/scale);
trainlab=[-ones(1,num/2), +ones(1,num/2) ];

% Training data and solver settings.
sg('set_features', 'TRAIN', traindat);
sg('set_labels', 'TRAIN', trainlab);
sg('c', C);
sg('svm_use_bias', false);
sg('svm_bufsize', 1000);
sg('svm_epsilon', epsilon);
sg('new_classifier', 'SVMOCAS');

% Timed training (the elapsed time is echoed).
tic;
sg('train_classifier');
timeocas=toc

% Outputs and error of the trained classifier on the training data.
[b,W]=sg('get_classifier');
sg('set_features', 'TEST', traindat);
trainout=sg('classify');
trainerr=mean(trainlab~=sign(trainout))

% Fresh classifier initialised from (b, W'); classify the same data again.
sg('new_classifier', 'SVMOCAS');
sg('set_linear_classifier', b, W');
sg('set_features', 'TEST', traindat);
trainout2=sg('classify');
trainerr2=mean(trainlab~=sign(trainout2))

% Largest difference between the two output vectors (echoed).
max(abs(trainout-trainout2))

% Echo bias, weights and the objective value.
b
W'
obj=sum(W.^2)+C*sum((1-trainlab.*(W'*traindat+b)).^2)
% SVMSGD example: linear classifier on sparse real-valued features.

% Solver parameters.
epsilon=1e-5;
max_train_time=60;
use_bias=false;
C=1.2;

% Make the helper directory available and read the toy data.
addpath('tools');
label_train_twoclass=load_matrix('../data/label_train_twoclass.dat');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

disp('SVMSGD');

% Sparse training features, labels, classifier and solver settings.
sg('set_features', 'TRAIN', sparse(fm_train_real));
sg('set_labels', 'TRAIN', label_train_twoclass);
sg('new_classifier', 'SVMSGD');
sg('svm_epsilon', epsilon);
sg('svm_use_bias', use_bias);
sg('svm_max_train_time', max_train_time);
sg('c', C);

% Train, then classify the (sparse) test features.
sg('train_classifier');
sg('set_features', 'TEST', sparse(fm_test_real));
result=sg('classify');
% SVMSGD on a small synthetic sparse data set: trains, reads back (b, W)
% and echoes timing, training error, bias, weights and an objective value.

C=10;

% Seeded uniform toy data, rescaled by dim times the overall mean.
rand('state',17);
num=16;
dim=10;
dist=0.001;
traindat=[rand(dim,num/2)-dist, rand(dim,num/2)+dist];
scale=(dim*mean(traindat(:)));
traindat=sparse(traindat/scale);
trainlab=[-ones(1,num/2), +ones(1,num/2) ];

% Training data and solver settings.
sg('set_features', 'TRAIN', traindat);
sg('set_labels', 'TRAIN', trainlab);
sg('c', C);
sg('svm_use_bias', false);
sg('new_classifier', 'SVMSGD');

% Timed training (the elapsed time is echoed).
tic;
sg('train_classifier');
timesgd=toc

% Retrieve the classifier and evaluate it on the training data.
[b,W]=sg('get_classifier');
sg('set_features', 'TEST', traindat);
trainout=sg('classify');
trainerr=mean(trainlab~=sign(trainout))

% Echo bias, weights and the objective value.
b
W'
obj=sum(W.^2)+C*sum((1-trainlab.*(W'*traindat+b)).^2)
% KMeans clustering example with a Euclidean distance on real features.
addpath('tools');
fm_train=load_matrix('../data/fm_train_real.dat');

disp('KMeans');

% Both values are handed to 'train_clustering' below.
k=3;
iter=1000;

% Features, distance and clustering method.
sg('set_features', 'TRAIN', fm_train);
sg('set_distance', 'EUCLIDIAN', 'REAL');
sg('new_clustering', 'KMEANS');

% Train and fetch the two results returned by 'get_clustering'.
sg('train_clustering', k, iter);
[radi, centers]=sg('get_clustering');
% Hierarchical clustering example with a Euclidean distance on real features.
addpath('tools');
fm_train=load_matrix('../data/fm_train_real.dat');

disp('Hierarchical');

% Handed to 'train_clustering' below.
merges=3;

% Features, distance and clustering method.
sg('set_features', 'TRAIN', fm_train);
sg('set_distance', 'EUCLIDIAN', 'REAL');
sg('new_clustering', 'HIERARCHICAL');

% Train and fetch the two results returned by 'get_clustering'.
sg('train_clustering', merges);
[merge_distance, pairs]=sg('get_clustering');
% Bray-Curtis distance: distance matrices for the train and test features.
addpath('tools');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

disp('BrayCurtisDistance');
sg('set_distance', 'BRAYCURTIS', 'REAL');

% Distance matrix requested for 'TRAIN'.
sg('set_features', 'TRAIN', fm_train_real);
dm=sg('get_distance_matrix', 'TRAIN');

% Distance matrix requested for 'TEST'; dm is overwritten.
sg('set_features', 'TEST', fm_test_real);
dm=sg('get_distance_matrix', 'TEST');
% Canberra metric: distance matrices for the train and test features.
addpath('tools');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

disp('CanberraMetric');
sg('set_distance', 'CANBERRA', 'REAL');

% Distance matrix requested for 'TRAIN'.
sg('set_features', 'TRAIN', fm_train_real);
dm=sg('get_distance_matrix', 'TRAIN');

% Distance matrix requested for 'TEST'; dm is overwritten.
sg('set_features', 'TEST', fm_test_real);
dm=sg('get_distance_matrix', 'TEST');
% Canberra distance on word features obtained by converting DNA strings.
addpath('tools');
fm_train_dna=load_matrix('../data/fm_train_dna.dat');
fm_test_dna=load_matrix('../data/fm_test_dna.dat');

% Arguments of the CHAR -> WORD string conversion.
order=3;
gap=0;
reverse='n';

disp('CanberraWordDistance');
sg('set_distance', 'CANBERRA', 'WORD');
sg('add_preproc', 'SORTWORDSTRING');

% Train side: load DNA, convert to words, attach the preprocessor.
sg('set_features', 'TRAIN', fm_train_dna, 'DNA');
sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse);
sg('attach_preproc', 'TRAIN');
dm=sg('get_distance_matrix', 'TRAIN');

% Test side: same steps; dm is overwritten.
sg('set_features', 'TEST', fm_test_dna, 'DNA');
sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse);
sg('attach_preproc', 'TEST');
dm=sg('get_distance_matrix', 'TEST');
% Chebyshew metric: distance matrices for the train and test features.
addpath('tools');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

disp('ChebyshewMetric');
sg('set_distance', 'CHEBYSHEW', 'REAL');

% Distance matrix requested for 'TRAIN'.
sg('set_features', 'TRAIN', fm_train_real);
dm=sg('get_distance_matrix', 'TRAIN');

% Distance matrix requested for 'TEST'; dm is overwritten.
sg('set_features', 'TEST', fm_test_real);
dm=sg('get_distance_matrix', 'TEST');
% Chi-square distance: distance matrices for the train and test features.
addpath('tools');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

disp('ChiSquareDistance');
sg('set_distance', 'CHISQUARE', 'REAL');

% Distance matrix requested for 'TRAIN'.
sg('set_features', 'TRAIN', fm_train_real);
dm=sg('get_distance_matrix', 'TRAIN');

% Distance matrix requested for 'TEST'; dm is overwritten.
sg('set_features', 'TEST', fm_test_real);
dm=sg('get_distance_matrix', 'TEST');
% Cosine distance: distance matrices for the train and test features.
addpath('tools');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

disp('CosineDistance');
sg('set_distance', 'COSINE', 'REAL');

% Distance matrix requested for 'TRAIN'.
sg('set_features', 'TRAIN', fm_train_real);
dm=sg('get_distance_matrix', 'TRAIN');

% Distance matrix requested for 'TEST'; dm is overwritten.
sg('set_features', 'TEST', fm_test_real);
dm=sg('get_distance_matrix', 'TEST');
% Euclidean distance: distance matrices for the train and test features.
addpath('tools');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

disp('EuclidianDistance');
sg('set_distance', 'EUCLIDIAN', 'REAL');

% Distance matrix requested for 'TRAIN'.
sg('set_features', 'TRAIN', fm_train_real);
dm=sg('get_distance_matrix', 'TRAIN');

% Distance matrix requested for 'TEST'; dm is overwritten.
sg('set_features', 'TEST', fm_test_real);
dm=sg('get_distance_matrix', 'TEST');
% Geodesic metric: distance matrices for the train and test features.
addpath('tools');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

disp('GeodesicMetric');
sg('set_distance', 'GEODESIC', 'REAL');

% Distance matrix requested for 'TRAIN'.
sg('set_features', 'TRAIN', fm_train_real);
dm=sg('get_distance_matrix', 'TRAIN');

% Distance matrix requested for 'TEST'; dm is overwritten.
sg('set_features', 'TEST', fm_test_real);
dm=sg('get_distance_matrix', 'TEST');
% Hamming distance on word features obtained by converting DNA strings.
addpath('tools');
fm_train_dna=load_matrix('../data/fm_train_dna.dat');
fm_test_dna=load_matrix('../data/fm_test_dna.dat');

% Arguments of the CHAR -> WORD string conversion.
order=3;
gap=0;
reverse='n'; % bit silly to not use boolean, set 'r' to yield true

disp('HammingWordDistance');
sg('set_distance', 'HAMMING', 'WORD');
sg('add_preproc', 'SORTWORDSTRING');

% Train side: load DNA, convert to words, attach the preprocessor.
sg('set_features', 'TRAIN', fm_train_dna, 'DNA');
sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse);
sg('attach_preproc', 'TRAIN');
dm=sg('get_distance_matrix', 'TRAIN');

% Test side: same steps; dm is overwritten.
sg('set_features', 'TEST', fm_test_dna, 'DNA');
sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse);
sg('attach_preproc', 'TEST');
dm=sg('get_distance_matrix', 'TEST');
% Jensen metric: distance matrices for the train and test features.
addpath('tools');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

disp('JensenMetric');
sg('set_distance', 'JENSEN', 'REAL');

% Distance matrix requested for 'TRAIN'.
sg('set_features', 'TRAIN', fm_train_real);
dm=sg('get_distance_matrix', 'TRAIN');

% Distance matrix requested for 'TEST'; dm is overwritten.
sg('set_features', 'TEST', fm_test_real);
dm=sg('get_distance_matrix', 'TEST');
% Manhattan metric: distance matrices for the train and test features.
addpath('tools');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

disp('ManhattanMetric');
sg('set_distance', 'MANHATTAN', 'REAL');

% Distance matrix requested for 'TRAIN'.
sg('set_features', 'TRAIN', fm_train_real);
dm=sg('get_distance_matrix', 'TRAIN');

% Distance matrix requested for 'TEST'; dm is overwritten.
sg('set_features', 'TEST', fm_test_real);
dm=sg('get_distance_matrix', 'TEST');
% Manhattan distance on word features obtained by converting DNA strings.
addpath('tools');
fm_train_dna=load_matrix('../data/fm_train_dna.dat');
fm_test_dna=load_matrix('../data/fm_test_dna.dat');

% Arguments of the CHAR -> WORD string conversion.
order=3;
gap=0;
reverse='n';

disp('ManhattanWordDistance');
sg('set_distance', 'MANHATTAN', 'WORD');
sg('add_preproc', 'SORTWORDSTRING');

% Train side: load DNA, convert to words, attach the preprocessor.
sg('set_features', 'TRAIN', fm_train_dna, 'DNA');
sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse);
sg('attach_preproc', 'TRAIN');
dm=sg('get_distance_matrix', 'TRAIN');

% Test side: same steps; dm is overwritten.
sg('set_features', 'TEST', fm_test_dna, 'DNA');
sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse);
sg('attach_preproc', 'TEST');
dm=sg('get_distance_matrix', 'TEST');
% Minkowski metric: distance matrices for the train and test features.
addpath('tools');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

disp('MinkowskiMetric');

% k is passed as the extra argument of 'set_distance'.
k=3;
sg('set_distance', 'MINKOWSKI', 'REAL', k);

% Distance matrix requested for 'TRAIN'.
sg('set_features', 'TRAIN', fm_train_real);
dm=sg('get_distance_matrix', 'TRAIN');

% Distance matrix requested for 'TEST'; dm is overwritten.
sg('set_features', 'TEST', fm_test_real);
dm=sg('get_distance_matrix', 'TEST');
% Tanimoto distance: distance matrices for the train and test features.
addpath('tools');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

disp('TanimotoDistance');
sg('set_distance', 'TANIMOTO', 'REAL');

% Distance matrix requested for 'TRAIN'.
sg('set_features', 'TRAIN', fm_train_real);
dm=sg('get_distance_matrix', 'TRAIN');

% Distance matrix requested for 'TEST'; dm is overwritten.
sg('set_features', 'TEST', fm_test_real);
dm=sg('get_distance_matrix', 'TEST');
% Histogram distribution example. Only the feature preparation runs; the
% distribution calls themselves are left commented out.

% Parameters (several are only referenced by the commented-out code or
% not at all).
leng=50;
rep=5;
weight=1;
num=12;
len=23;

% Arguments of the CHAR -> WORD string conversion.
order=3;
gap=0;
reverse='n'; % bit silly to not use boolean, set 'r' to yield true

addpath('tools');
fm_train_dna=load_matrix('../data/fm_train_dna.dat');

disp('Histogram');

%sg('new_distribution', 'HISTOGRAM');
sg('add_preproc', 'SORTWORDSTRING');

% Load DNA, convert to words, attach the preprocessor.
sg('set_features', 'TRAIN', fm_train_dna, 'DNA');
sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse);
sg('attach_preproc', 'TRAIN');

%	sg('train_distribution');
%	histo=sg('get_histogram');

%	num_param=sg('get_histogram_num_model_parameters');
%	for i = 1:num,
%		for j = 1:num_param,
%			sg(sprintf('get_log_derivative %d %d', j, i));
%		end
%	end

%	sg('get_log_likelihood');
%	sg('get_log_likelihood_sample');
% HMM example: builds one random character sequence from three "loaded
% cubes", trains an HMM on it with 'bw', copies the model into a new HMM
% and queries its likelihood. No random seed is set in this script.

% Generation parameters.
weight=1;
rep=5;
leng=50;

addpath('tools');
fm_train_dna=load_matrix('../data/fm_train_dna.dat');   % loaded but not used below

disp('HMM');

% Both values are handed to 'new_hmm' below.
N=3;
M=6;

% generate a sequence with characters 1-6 drawn from 3 loaded cubes
for i = 1:3,
	% Each cube is a shuffled row vector holding a random number (up to
	% leng) of every face value 1..6.
	a{i}= [ ones(1,ceil(leng*rand)) ...
		2*ones(1,ceil(leng*rand)) ...
		3*ones(1,ceil(leng*rand)) ...
		4*ones(1,ceil(leng*rand)) ...
		5*ones(1,ceil(leng*rand)) ...
		6*ones(1,ceil(leng*rand)) ];
	a{i}= a{i}(randperm(length(a{i})));
end

% Random, shuffled schedule of cube indices.
s=[];
for i = 1:size(a,2),
	s= [ s i*ones(1,ceil(rep*rand)) ];
end
s=s(randperm(length(s)));

% For every scheduled cube, append a random selection of its faces
% (as the characters '1'..'6') to the single output sequence.
sequence={''};
for i = 1:length(s),
	f(i)=ceil(((1-weight)*rand+weight)*length(a{s(i)}));
	t=randperm(length(a{s(i)}));
	r=a{s(i)}(t(1:f(i)));
	sequence{1}=[sequence{1} char(r+'0')];
end

% Create the HMM, load the sequence, convert it to words and train.
sg('new_hmm', N, M);
sg('set_features','TRAIN', sequence, 'CUBE');
sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', 1);
sg('bw');

% Read the model back (note: this overwrites the cell array a), install
% it into a fresh HMM and query the likelihood.
[p, q, a, b]=sg('get_hmm');
sg('new_hmm', N, M);
sg('set_hmm', p, q, a, b);
likelihood=sg('hmm_likelihood');
% LinearHMM distribution example. Only the feature preparation runs; the
% distribution calls themselves are left commented out.

% num is only referenced by the commented-out code.
num=12;

% Arguments of the CHAR -> WORD string conversion.
order=3;
gap=0;
reverse='n'; % bit silly to not use boolean, set 'r' to yield true

addpath('tools');
fm_train_dna=load_matrix('../data/fm_train_dna.dat');

disp('LinearHMM');

%sg('new_distribution', 'LinearHMM');
sg('add_preproc', 'SORTWORDSTRING');

% Load DNA, convert to words, attach the preprocessor.
sg('set_features', 'TRAIN', fm_train_dna, 'DNA');
sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse);
sg('attach_preproc', 'TRAIN');

%	sg('train_distribution');
%	histo=sg('get_histogram');

%	num_param=sg('get_histogram_num_model_parameters');
%	for i = 1:num,
%		for j = 1:num_param,
%			sg(sprintf('get_log_derivative %d %d', j, i));
%		end
%	end

%	sg('get_log_likelihood');
%	sg('get_log_likelihood_sample');
% Two linear HMMs, one trained on the positive and one on the negative
% training sequences, are each scored on the validation set; the
% difference of the two outputs is used as classifier output and the
% resulting error rate is echoed.

% Data and model parameters.
seqlen=100;
numseq=50000;
order=2; %max 8, markov chain has in fact of order-1
ppseudo=1e-5;
npseudo=10;
motifidx=10:21;

acgt='ACGT';
rand('state', 17);

% Training set: random strings; the rows in motifidx are set to 'T' for
% the positive half.
LT=[-ones(1,numseq), ones(1,numseq)];
XT=acgt(ceil(3*rand(seqlen,2*numseq)));
XT(motifidx,LT==1)='T';

% Validation set, generated the same way.
LV=[-ones(1,numseq), ones(1,numseq)];
XV=acgt(ceil(3*rand(seqlen,2*numseq)));
XV(motifidx,LV==1)='T';

% Model trained on the positive sequences, with pseudo value ppseudo.
sg('set_features', 'TRAIN', XT(:,LT==1), 'DNA') ;
sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order);
sg('pseudo', ppseudo);
sg('new_hmm', size(XT,1), 4^order);
sg('linear_train');
[p_p,q_p,a_p,b_p]=sg('get_hmm');

sg('set_features', 'TEST', XV, 'DNA') ;
sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order);
posout=sg('one_class_linear_hmm_classify');

% Model trained on the negative sequences, with pseudo value npseudo.
sg('set_features', 'TRAIN', XT(:,LT==-1), 'DNA');
sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order);
sg('pseudo', npseudo);
sg('new_hmm', size(XT,1), 4^order);
sg('linear_train');
[p_n,q_n,a_n,b_n]=sg('get_hmm');

sg('set_features', 'TEST', XV, 'DNA') ;
sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order);
negout=sg('one_class_linear_hmm_classify');

% Combine both outputs and echo the validation error.
output=posout-negout;
err=mean(sign(output)~=LV)
% Polynomial features example: first an SVMLight with a 'POLY' kernel is
% trained, then the data is loaded as explicit 'POLY' features and an
% SVMOCAS classifier is trained on those.

degree = 2;

% Unseeded toy data: 50 columns per class, 10 rows each.
traindat = [rand(10,50)-1 2+rand(10,50)+1];
testdat = [rand(10,50)-1 2+rand(10,50)+1];
trainlab = [ones(1, 50) -ones(1, 50)];

% Solver parameters.
C=1;
size_cache=10;
epsilon=1e-5;

% --- Part 1: 'POLY' kernel with SVMLight ---
sg('set_kernel', 'POLY', 'REAL', size_cache, degree);
%sg('set_kernel_normalization', 'IDENTITY');
sg('set_features', 'TRAIN', traindat);
sg('set_labels', 'TRAIN', trainlab);
sg('new_classifier', 'SVMLIGHT');
sg('svm_epsilon', epsilon);
sg('svm_use_bias', 0);
sg('c', C);

% Kernel matrix on the training data.
km=sg('get_kernel_matrix', 'TRAIN');

% Timed training (elapsed time is printed by toc).
tic;
sg('train_classifier');
toc

sg('set_features', 'TEST', testdat);
result=sg('classify');

% --- Part 2: explicit 'POLY' features with SVMOCAS ---
normalize=1;
sg('loglevel', 'DEBUG');
sg('svm_use_bias', 0);
sg('set_features', 'TRAIN', traindat, 'POLY', degree, normalize);

% Read the features back and form their Gram matrix.
x = sg('get_features', 'TRAIN');
km2=x'*x;

sg('c', C);
sg('svm_epsilon', epsilon);
sg('svm_use_bias', 0);
sg('new_classifier', 'SVMOCAS');
sg('train_classifier');

sg('set_features', 'TEST', testdat, 'POLY', degree, normalize);
out_wdocas=sg('classify');
% Weighted-degree example: trains a WEIGHTEDDEGREE kernel SVM, WDSVMOCAS,
% SVMOCAS on 'WD' features, and SVMOCAS on explicitly built sparse/dense
% feature matrices, printing the maximum absolute output differences.

% --- parameters ---
C=1;
order=6;
degree=order;
from_order=6;
max_mismatch=0;
cache=100;
normalize=1;
mkl_stepsize=1;
block=1;
single_degree=-1;
epsilon=1e-5;

rand('seed',17);

%sequence lengths, number of sequences
len=20;
num_train=10;
num_a=5;
aa=(round(len/2-num_a/2)):(round(len/2+num_a/2-1));
epsilon=1e-6; % overrides the value assigned above

%generate some toy data
acgt='ACGT';
shift=1;
rand('state',1);
traindat=acgt(ceil(4*rand(len,num_train)));
trainlab=[-ones(1,num_train/2),ones(1,num_train/2)];
aas=floor((shift+1)*rand(num_train,1));
idx=find(trainlab==1);
% write a run of 'A' at a shifted position into every positive example
for i=1:length(idx),
	traindat(aa+aas(i),idx(i))='A';
end
testdat=traindat;
testlab=trainlab;

%train svm
sg('threads',1);
sg('use_linadd', 1);
sg('use_batch_computation', 1);
sg('progress', 'ON');
sg('set_features', 'TRAIN', traindat, 'DNA');
sg('set_labels', 'TRAIN', trainlab);
sg('svm_use_bias', 0);
sg('new_classifier', 'LIGHT');
sg('set_kernel', 'WEIGHTEDDEGREE', 'CHAR', cache, from_order, max_mismatch, normalize, mkl_stepsize, block, single_degree);

%x=sg('get_subkernel_weights');
%
%sg(sprintf( 'set_kernel WEIGHTEDDEGREE CHAR %i %i %i %i %i %i %i', cache, order, max_mismatch, 0, mkl_stepsize, block, single_degree) );
%sg('set_subkernel_weights',x(1:order));
%
%%kmu=sg('get_kernel_matrix', 'TRAIN');
%
%sg(sprintf( 'set_kernel WEIGHTEDDEGREE CHAR %i %i %i %i %i %i %i', cache, order, max_mismatch, normalize, mkl_stepsize, block, single_degree) );
%sg('set_subkernel_weights',x(1:order));
%%km=sg('get_kernel_matrix', 'TRAIN');
%sg('new_classifier LIGHT');

sg('c',C);
tic;
sg('svm_train');
tim_lo=toc;

%evaluate svm on test data
sg('set_features', 'TEST', testdat, 'DNA');
out_ref=sg('svm_classify');
%prc_ref=calcrfcscore(out_ref, testlab);
%roc_ref=calcrocscore(out_ref, testlab);

% map the letters A,C,G,T to the byte values 0..3
traindat(traindat=='A')=0;
traindat(traindat=='C')=1;
traindat(traindat=='G')=2;
traindat(traindat=='T')=3;
traindat=uint8(traindat);

testdat(testdat=='A')=0;
testdat(testdat=='C')=1;
testdat(testdat=='G')=2;
testdat(testdat=='T')=3;
testdat=uint8(testdat);

% --- WDSVMOCAS ---
% NOTE(review): TRAIN is passed transposed here while TEST below is not;
% kept exactly as in the original - confirm against the sg documentation.
sg('set_features', 'TRAIN', traindat', 'RAWDNA');
sg('set_labels', 'TRAIN', trainlab);
sg('c',C);
sg('svm_epsilon', epsilon);
sg('new_classifier','WDSVMOCAS',order, from_order);
tic;
sg('svm_train');
tim_lo=toc;

%evaluate svm on test data
sg('set_features', 'TEST', testdat, 'RAWDNA');
out=sg('svm_classify');
%prc=calcrfcscore(out, testlab);
%roc=calcrocscore(out, testlab);

% --- SVMOCAS on 'WD' features ---
sg('set_features', 'TRAIN', traindat, 'RAWDNA', 'WD', order, from_order);
sg('set_labels', 'TRAIN', trainlab);
sg('c', C);
sg('svm_epsilon', epsilon);
sg('svm_use_bias', 0);
sg('new_classifier', 'SVMOCAS');
sg('train_classifier');
sg('set_features', 'TEST', testdat, 'RAWDNA', 'WD', order, from_order);
out_wdocas=sg('classify');

% no semicolon: the differences are printed
max(abs(out-out_ref))
max(abs(out_wdocas-out_ref))
max(abs(out_wdocas-out))

% --- explicit feature matrix ---
dat=[];
weights=sqrt((degree:-1:1)/sum(degree:-1:1))/4.281744;
N = size(traindat,1);
% 4^1 + ... + 4^degree entries per position, times N positions
nDim = 0;
for d = 1:degree,
	nDim = nDim + 4^d;
end
nDim = nDim*N;

for j=1:size(traindat,2),
	dat(:,j)= zeros(nDim,1);
	offset = 0;
	for i=1:N,
		val = 0;
		for d = 1:degree
			if i+d-1<=N,
				val = 4*val + double(traindat(i+d-1,j));
				dat(offset+val+1,j) = weights(d);
				offset = offset + 4^d;
			end
		end
	end
end

% --- SVMOCAS on the sparse matrix ---
traindat=sparse(dat);
testdat=traindat;
sg('set_features', 'TRAIN', traindat);
sg('set_labels', 'TRAIN', trainlab);
sg('c', C);
sg('svm_epsilon', epsilon);
sg('svm_use_bias', 0);
sg('new_classifier', 'SVMOCAS');
sg('train_classifier');
sg('set_features', 'TEST', traindat);
out_ocas=sg('classify');

% --- SVMOCAS on the dense matrix ---
sg('set_features', 'TRAIN', dat);
sg('set_labels', 'TRAIN', trainlab);
sg('c', C);
sg('svm_epsilon', epsilon);
sg('svm_use_bias', 0);
sg('new_classifier', 'SVMOCAS');
sg('train_classifier');
sg('set_features', 'TEST', dat);
out_docas=sg('classify');

max(abs(out-out_ocas))
max(abs(out-out_ref))
max(abs(out_ocas-out_ref))
max(abs(out_ocas-out_docas))

% --- stacked matrix versus two feature objects added separately ---
sg('set_features', 'TRAIN', [traindat;2*traindat]);
sg('set_labels', 'TRAIN', trainlab);
sg('c', C);
sg('svm_epsilon', epsilon);
sg('svm_use_bias', 0);
sg('new_classifier', 'SVMOCAS');
sg('train_classifier');
sg('set_features', 'TEST', [traindat;2*traindat]);
out1=sg('classify');

sg('clean_features','TRAIN');
sg('clean_features','TEST');
sg('add_dotfeatures', 'TRAIN', traindat);
sg('add_dotfeatures', 'TRAIN', 2*dat);
sg('set_labels', 'TRAIN', trainlab);
sg('c', C);
sg('svm_epsilon', epsilon);
sg('svm_use_bias', 0);
sg('new_classifier', 'SVMOCAS');
sg('train_classifier');
sg('add_dotfeatures', 'TEST', traindat);
sg('add_dotfeatures', 'TEST', 2*dat);
out2=sg('classify');

max(abs(out1-out2))
% Weighted spectrum example: sums weighted COMMSTRING kernel matrices for
% orders 1..order, compares the sum with the WEIGHTEDCOMMSTRING kernel
% matrix, then compares SVMLIGHT outputs (linadd and custom kernel) with
% SVMOCAS on 'WSPEC' features.

rand('seed',17);

%sequence lengths, number of sequences
len=100;
num_train=10;
num_a=5;
aa=(round(len/2-num_a/2)):(round(len/2+num_a/2-1));
epsilon=1e-8;

%SVM regularization factor C
C=1;

%Spectrum kernel parameters
order=8;
cache=10;
use_sign=false;
normalize=true;

if normalize,
	normalization='FULL'; %NO,SQRT,LEN,SQLEN,FULL
else
	normalization='NO'; %NO,SQRT,LEN,SQLEN,FULL
end

%generate some toy data
acgt='ACGT';
shift=40;
rand('state',1);
traindat=acgt(ceil(4*rand(len,num_train)));
trainlab=[-ones(1,num_train/2),ones(1,num_train/2)];
aas=floor((shift+1)*rand(num_train,1));
idx=find(trainlab==1);
% write a run of 'A' at a shifted position into every positive example
for i=1:length(idx),
	traindat(aa+aas(i),idx(i))='A';
end

sg('loglevel', 'ALL');

%%% spec
weights=(order:-1:1);
weights=weights/sum(weights);
km=zeros(size(traindat,2));
for o=1:order,
	sg('set_features', 'TRAIN', traindat, 'DNA');
	sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', o, order-1);
	sg('add_preproc', 'SORTWORDSTRING');
	sg('attach_preproc', 'TRAIN');
	% single quotes: a double-quoted "NO" is not a char array in MATLAB
	sg('set_kernel', 'COMMSTRING', 'WORD',cache, use_sign, 'NO');
	km=km+weights(o)*sg('get_kernel_matrix', 'TRAIN');
end

% normalise the summed matrix: k(i,j)/sqrt(k(i,i)*k(j,j))
km2=km;
if normalize,
	for i=1:size(km,1),
		for j=1:size(km,2),
			km2(i,j)=km(i,j)/(sqrt(km(i,i)*km(j,j)));
		end
	end
end

%%% wdspec
sg('set_features', 'TRAIN', traindat, 'DNA');
sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, 0, 'r');
sg('add_preproc', 'SORTWORDSTRING');
sg('attach_preproc', 'TRAIN');
sg('set_kernel', 'WEIGHTEDCOMMSTRING', 'WORD', cache, use_sign, normalization);

feat=sg('get_features','TRAIN');
wkm=sg('get_kernel_matrix', 'TRAIN');

fprintf('max diff %g\n', max(abs(wkm(:)-km2(:))))

% --- SVMLIGHT with linadd on the weighted spectrum kernel ---
sg('c', C);
sg('svm_epsilon', epsilon);
sg('svm_use_bias', 0);
sg('use_linadd', true);
sg('new_classifier', 'SVMLIGHT');
sg('set_labels','TRAIN', trainlab);
sg('train_classifier');
[bias, alphas]=sg('get_classifier');
sg('init_kernel_optimization');
svmw=sg('get_kernel_optimization');

sg('set_features', 'TEST', traindat, 'DNA');
sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, 0, 'r');
sg('add_preproc', 'SORTWORDSTRING');
sg('attach_preproc', 'TEST');
out_ref=sg('classify');

% --- SVMLIGHT on the hand-computed matrix as a CUSTOM kernel ---
sg('c', C);
sg('clean_features', 'TRAIN');
sg('clean_features', 'TEST');
sg('svm_epsilon', epsilon);
sg('svm_use_bias', 0);
sg('use_linadd', false);
sg('new_classifier', 'SVMLIGHT');
sg('set_features', 'TRAIN', traindat, 'DNA');
sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1);
sg('set_labels','TRAIN', trainlab);
sg('set_kernel','CUSTOM', km2, 'FULL');
sg('train_classifier');

sg('set_features', 'TEST', traindat, 'DNA');
sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1);
out_ref2=sg('classify');

% map the letters A,C,G,T to the byte values 0..3
traindat(traindat=='A')=0;
traindat(traindat=='C')=1;
traindat(traindat=='G')=2;
traindat(traindat=='T')=3;
traindat=uint8(traindat);
testdat=uint8(traindat);

% --- SVMOCAS on 'WSPEC' features ---
% NOTE(review): presumably 'clear sg' is meant to reset the interface state
% before the final run - confirm.
clear sg
sg('svm_use_bias', 0);
sg('svm_epsilon', epsilon);
sg('set_labels','TRAIN', trainlab);
sg('set_features', 'TRAIN', traindat, 'RAWDNA','WSPEC', order, order-1, normalize);
sg('new_classifier', 'SVMOCAS');
sg('train_classifier');
[bias_ocas, alphas_ocas]=sg('get_classifier');

sg('set_features', 'TEST', testdat, 'RAWDNA','WSPEC', order, order-1, normalize);
out=sg('classify');

fprintf('max out diff %g\n', max(abs(out-out_ref)))
fprintf('max out diff %g\n', max(abs(out-out_ref2)))

% no semicolon: the difference is printed
max(abs(svmw(1:length(alphas_ocas))-alphas_ocas'))

%o=[];
%for i=1:length(feat),
%	o(i)=alphas_ocas*feat{i};
%end
% String feature example: sets DNA string features with 'slide_window' and
% 'from_position_list', prints what get_features returns, and prints a
% WEIGHTEDDEGREE kernel matrix on the TRAIN features.

%generate some toy data
acgt='ACGT';
dat={acgt([1*ones(1,10), 2*ones(1,10), 3*ones(1,10), 4*ones(1,10), 1])};

% sliding window (arguments 5 and 1)
sg('set_features', 'TRAIN', dat, 'DNA', 'slide_window', 5, 1);
f=sg('get_features', 'TRAIN')

% explicit position list for TRAIN, then the same list for TEST
sg('set_features', 'TRAIN', dat, 'DNA', 'from_position_list',5, int32([0,1,2,5,15,25,30,36]));
f=sg('get_features', 'TRAIN')

sg('set_features', 'TEST', dat, 'DNA', 'from_position_list',5, int32([0,1,2,5,15,25,30,36]));
ft=sg('get_features', 'TEST')

% --- parameters (several are not used below) ---
C=1;
order=20;
order_com=5;
max_mismatch=0;
len=200;
shift=0;
num=100;
num_test=5000;
cache=10;
normalize=true;
mkl_stepsize=1;
block=0;
single_degree=-1;

sg('set_kernel', 'WEIGHTEDDEGREE', 'STRING', cache, order, max_mismatch, normalize, mkl_stepsize, block, single_degree);

% no semicolon: the kernel matrix is printed
km=sg('get_kernel_matrix', 'TRAIN')

sg('clean_features', 'TRAIN');
sg('clean_features', 'TEST');

% set TRAIN twice more with shifted position lists, then clean up
sg('set_features', 'TRAIN', dat, 'DNA', 'from_position_list',5, int32([0,1,2,5,15,25,30]+5));
sg('set_features', 'TRAIN', dat, 'DNA', 'from_position_list',5, int32([0,1,2,5,15,25]+9));
sg('clean_features', 'TRAIN');
% Chi2 kernel example: sets a CHI2 kernel on real-valued features and
% fetches the kernel matrix for TRAIN and then for TEST.

% --- parameters ---
size_cache=10;
width=1.4;

% --- data ---
addpath('tools');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

% --- CHI2 ---
disp('Chi2');
sg('set_kernel', 'CHI2', 'REAL', size_cache, width);

sg('set_features', 'TRAIN', fm_train_real);
km=sg('get_kernel_matrix', 'TRAIN');

% km is overwritten with the TEST matrix
sg('set_features', 'TEST', fm_test_real);
km=sg('get_kernel_matrix', 'TEST');
% Combined kernel example: builds a COMBINED kernel from LINEAR, GAUSSIAN
% and POLY sub-kernels, adding one TRAIN/TEST feature object after each.

size_cache=10;

% --- data ---
addpath('tools');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

% --- Combined ---
disp('Combined');
sg('clean_features','TRAIN');
sg('clean_features','TEST');
sg('set_kernel', 'COMBINED', size_cache);

% sub-kernel 1: linear
sg('add_kernel', 1, 'LINEAR', 'REAL', size_cache);
sg('add_features', 'TRAIN', fm_train_real);
sg('add_features', 'TEST', fm_test_real);

% sub-kernel 2: gaussian (last argument 1)
sg('add_kernel', 1, 'GAUSSIAN', 'REAL', size_cache, 1);
sg('add_features', 'TRAIN', fm_train_real);
sg('add_features', 'TEST', fm_test_real);

% sub-kernel 3: polynomial (arguments 3, false)
sg('add_kernel', 1, 'POLY', 'REAL', size_cache, 3, false);
sg('add_features', 'TRAIN', fm_train_real);
sg('add_features', 'TEST', fm_test_real);

% km is overwritten with the TEST matrix
km=sg('get_kernel_matrix', 'TRAIN');
km=sg('get_kernel_matrix', 'TEST');
% CommUlongString example: converts DNA strings to ULONG string features,
% attaches the SORTULONGSTRING preprocessor and fetches the COMMSTRING
% kernel matrix for TRAIN and then for TEST.

% --- parameters ---
size_cache=10;
order=30;
gap=0;
reverse='n';
use_sign=0;
normalization='FULL';

% --- data ---
addpath('tools');
fm_train_dna=load_matrix('../data/fm_train_dna.dat');
fm_test_dna=load_matrix('../data/fm_test_dna.dat');

% --- Comm Ulong String ---
disp('CommUlongString');
sg('add_preproc', 'SORTULONGSTRING');
sg('set_kernel', 'COMMSTRING', 'ULONG', size_cache, use_sign, normalization);

sg('set_features', 'TRAIN', fm_train_dna, 'DNA');
sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'ULONG', order, order-1, gap, reverse);
sg('attach_preproc', 'TRAIN');
km=sg('get_kernel_matrix', 'TRAIN');

% km is overwritten with the TEST matrix
sg('set_features', 'TEST', fm_test_dna, 'DNA');
sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'ULONG', order, order-1, gap, reverse);
sg('attach_preproc', 'TEST');
km=sg('get_kernel_matrix', 'TEST');
% CommWordString example: converts DNA strings to WORD string features,
% attaches the SORTWORDSTRING preprocessor and fetches the COMMSTRING
% kernel matrix for TRAIN and then for TEST.

% --- parameters ---
size_cache=10;
order=7;
gap=0;
reverse='n';
use_sign=0;
normalization='FULL';

% --- data ---
addpath('tools');
fm_train_dna=load_matrix('../data/fm_train_dna.dat');
fm_test_dna=load_matrix('../data/fm_test_dna.dat');

% --- Comm Word String ---
disp('CommWordString');
sg('add_preproc', 'SORTWORDSTRING');
sg('set_kernel', 'COMMSTRING', 'WORD', size_cache, use_sign, normalization);

sg('set_features', 'TRAIN', fm_train_dna, 'DNA');
sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse);
sg('attach_preproc', 'TRAIN');
km=sg('get_kernel_matrix', 'TRAIN');

% km is overwritten with the TEST matrix
sg('set_features', 'TEST', fm_test_dna, 'DNA');
sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse);
sg('attach_preproc', 'TEST');
km=sg('get_kernel_matrix', 'TEST');
% Const kernel example: sets a CONST kernel with the value c and fetches
% the kernel matrix for TRAIN and then for TEST.

size_cache=10;

% --- data ---
addpath('tools');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

% --- Const ---
disp('Const');
c=23;
sg('set_kernel', 'CONST', 'REAL', size_cache, c);

sg('set_features', 'TRAIN', fm_train_real);
km=sg('get_kernel_matrix', 'TRAIN');

% km is overwritten with the TEST matrix
sg('set_features', 'TEST', fm_test_real);
km=sg('get_kernel_matrix', 'TEST');
% Custom kernel example: trains LIBSVM on a random symmetric matrix passed
% as a CUSTOM kernel, then classifies all examples and example 0 alone.

% random +/-1 labels (sign() gives 0 only if rand returns exactly 0.5)
truth = sign(2*rand(1,60) - 1);

% random square matrix made symmetric by adding its transpose
km=rand(length(truth));
km=km+km';

sg('set_kernel', 'CUSTOM', km, 'FULL');
sg('set_labels', 'TRAIN', truth);
sg('new_classifier', 'LIBSVM');
sg('train_classifier');

out_all = sg('classify');
out = sg('classify_example',0);
% Diag kernel example: sets a DIAG kernel with the value diag_value and
% fetches the kernel matrix for TRAIN and then for TEST.

size_cache=10;

% --- data ---
addpath('tools');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

% --- Diag ---
disp('Diag');
% not named 'diag': scripts share the workspace, and a variable called diag
% would shadow the builtin diag() for everything run afterwards
diag_value=23.;
sg('set_kernel', 'DIAG', 'REAL', size_cache, diag_value);

sg('set_features', 'TRAIN', fm_train_real);
km=sg('get_kernel_matrix', 'TRAIN');

% km is overwritten with the TEST matrix
sg('set_features', 'TEST', fm_test_real);
km=sg('get_kernel_matrix', 'TEST');
% Distance kernel example: sets a distance first, then a DISTANCE kernel,
% and fetches the kernel matrix for TRAIN and then for TEST.

size_cache=10;

% --- data ---
addpath('tools');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

% --- Distance ---
disp('Distance');
width=1.7;
% 'EUCLIDIAN' is the spelling passed to sg; it is left unchanged
sg('set_distance', 'EUCLIDIAN', 'REAL');
sg('set_kernel', 'DISTANCE', size_cache, width);

sg('set_features', 'TRAIN', fm_train_real);
km=sg('get_kernel_matrix', 'TRAIN');

% km is overwritten with the TEST matrix
sg('set_features', 'TEST', fm_test_real);
km=sg('get_kernel_matrix', 'TEST');
% FixedDegreeString example: sets a FIXEDDEGREE kernel on DNA string
% features and fetches the kernel matrix for TRAIN and then for TEST.

size_cache=10;

% --- data ---
addpath('tools');
fm_train_dna=load_matrix('../data/fm_train_dna.dat');
fm_test_dna=load_matrix('../data/fm_test_dna.dat');

% --- Fixed Degree String ---
disp('FixedDegreeString');
degree=3;
sg('set_kernel', 'FIXEDDEGREE', 'CHAR', size_cache, degree);

sg('set_features', 'TRAIN', fm_train_dna, 'DNA');
km=sg('get_kernel_matrix', 'TRAIN');

% km is overwritten with the TEST matrix
sg('set_features', 'TEST', fm_test_dna, 'DNA');
km=sg('get_kernel_matrix', 'TEST');
% Gaussian kernel example: sets a GAUSSIAN kernel on real-valued features
% and fetches the kernel matrix for TRAIN and then for TEST.

% --- parameters ---
size_cache=10;
width=2.1;

% --- data ---
addpath('tools');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

% --- Gaussian ---
disp('Gaussian');
sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width);

sg('set_features', 'TRAIN', fm_train_real);
km=sg('get_kernel_matrix', 'TRAIN');

% km is overwritten with the TEST matrix
sg('set_features', 'TEST', fm_test_real);
km=sg('get_kernel_matrix', 'TEST');
% GaussianShift example: sets a GAUSSIANSHIFT kernel with width, max_shift
% and shift_step, and fetches the kernel matrix for TRAIN and then TEST.

% --- parameters ---
size_cache=10;
width=1.0;

% --- data ---
addpath('tools');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

% --- GaussianShift ---
disp('GaussianShift');
max_shift=2;
shift_step=1;
sg('set_kernel', 'GAUSSIANSHIFT', 'REAL', size_cache, width, max_shift, shift_step);

sg('set_features', 'TRAIN', fm_train_real);
km=sg('get_kernel_matrix', 'TRAIN');

% km is overwritten with the TEST matrix
sg('set_features', 'TEST', fm_test_real);
km=sg('get_kernel_matrix', 'TEST');
% Histogram kernel example: converts DNA strings to WORD features, trains
% a plugin estimator on the TRAIN labels, then fetches the HISTOGRAM
% kernel matrix for TRAIN and TEST.

% --- parameters ---
size_cache=10;
order=3;
gap=0;
reverse='n';

% --- data ---
addpath('tools');
label_train_dna=load_matrix('../data/label_train_dna.dat');
fm_train_dna=load_matrix('../data/fm_train_dna.dat');
fm_test_dna=load_matrix('../data/fm_test_dna.dat');

% --- Plugin Estimate ---
disp('PluginEstimate w/ HistogramWord');

sg('set_features', 'TRAIN', fm_train_dna, 'DNA');
sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse);

sg('set_features', 'TEST', fm_test_dna, 'DNA');
sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse);

% the estimator is trained before the kernel is set
pseudo_pos=1e-1;
pseudo_neg=1e-1;
sg('new_plugin_estimator', pseudo_pos, pseudo_neg);
sg('set_labels', 'TRAIN', label_train_dna);
sg('train_estimator');

sg('set_kernel', 'HISTOGRAM', 'WORD', size_cache);
km=sg('get_kernel_matrix', 'TRAIN');

% not supported yet;
% lab=sg('plugin_estimate_classify');

% km is overwritten with the TEST matrix
km=sg('get_kernel_matrix', 'TEST');
% SLIK example: trains LIBSVM with a SLIK kernel on random DNA toy data
% and classifies the test set three times, printing the accuracy each
% time and the maximum difference between the runs.

rand('seed',17);

%sequence lengths, number of sequences
len=200;
num_train=500;
num_test=500;
num_a=2;
aa=(round(len/2-num_a/2)):(round(len/2+num_a/2-1));

%SVM regularization factor C
C=1;

%locality improved kernel parameters
cache=100;
l=3;
d1=4;
d2=1;

%generate some toy data
acgt='ACGT';
rand('state',1);

% positive examples get 'A' at the rows in aa
traindat=acgt(ceil(4*rand(len,num_train)));
trainlab=[-ones(1,num_train/2),ones(1,num_train/2)];
traindat(aa,trainlab==1)='A';

testdat=acgt(ceil(4*rand(len,num_test)));
testlab=[-ones(1,num_test/2),ones(1,num_test/2)];
testdat(aa,testlab==1)='A';

%traindat'
%input('key to continue')

%train svm
sg('set_features', 'TRAIN', traindat, 'DNA');
sg('set_labels', 'TRAIN', trainlab);
sg('set_kernel', 'SLIK', 'CHAR', cache, l, d1, d2);
sg('new_classifier', 'LIBSVM');
sg('c', C);
tic;
sg('train_classifier');
toc;

%evaluate svm on test data
sg('set_features', 'TEST', testdat, 'DNA');
sg('set_labels', 'TEST', testlab);

out1=sg('classify');
fprintf('accuracy: %f \n', mean(sign(out1)==testlab))

out2=sg('classify');
fprintf('accuracy: %f \n', mean(sign(out2)==testlab))

tic;
out3=sg('classify');
toc;
fprintf('accuracy: %f \n', mean(sign(out3)==testlab))

% no semicolon: the differences are printed
max(abs(out1-out2))
max(abs(out1-out3))
% Linear kernel example: sets a LINEAR kernel with a scale argument and
% fetches the kernel matrix for TRAIN and then for TEST.

size_cache=10;

% --- data ---
addpath('tools');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

% --- Linear ---
disp('Linear');
scale=1.2;
sg('set_kernel', 'LINEAR', 'REAL', size_cache, scale);

sg('set_features', 'TRAIN', fm_train_real);
km=sg('get_kernel_matrix', 'TRAIN');

% km is overwritten with the TEST matrix
sg('set_features', 'TEST', fm_test_real);
km=sg('get_kernel_matrix', 'TEST');
% LinearByte example: sets a LINEAR kernel on uint8 features passed as
% 'RAWBYTE' and fetches the kernel matrix for TRAIN and then for TEST.

size_cache=10;

% --- data (cast to uint8 after loading) ---
addpath('tools');
fm_train_byte=uint8(load_matrix('../data/fm_train_byte.dat'));
fm_test_byte=uint8(load_matrix('../data/fm_test_byte.dat'));

% --- LinearByte is b0rked (original author's remark) ---
disp('LinearByte');
sg('set_kernel', 'LINEAR', 'BYTE', size_cache);

sg('set_features', 'TRAIN', fm_train_byte, 'RAWBYTE');
km=sg('get_kernel_matrix', 'TRAIN');

% km is overwritten with the TEST matrix
sg('set_features', 'TEST', fm_test_byte, 'RAWBYTE');
km=sg('get_kernel_matrix', 'TEST');
% LinearString example: sets a LINEAR kernel on DNA string features and
% fetches the kernel matrix for TRAIN and then for TEST.

size_cache=10;

% --- data ---
addpath('tools');
fm_train_dna=load_matrix('../data/fm_train_dna.dat');
fm_test_dna=load_matrix('../data/fm_test_dna.dat');

% --- Linear String ---
disp('LinearString');
sg('set_kernel', 'LINEAR', 'CHAR', size_cache);

sg('set_features', 'TRAIN', fm_train_dna, 'DNA');
km=sg('get_kernel_matrix', 'TRAIN');

% km is overwritten with the TEST matrix
sg('set_features', 'TEST', fm_test_dna, 'DNA');
km=sg('get_kernel_matrix', 'TEST');
% LinearWord example: sets a LINEAR kernel on uint16 features and fetches
% the kernel matrix for TRAIN and then for TEST.

size_cache=10;

% --- data (cast to uint16 after loading) ---
addpath('tools');
fm_train_word=uint16(load_matrix('../data/fm_train_word.dat'));
fm_test_word=uint16(load_matrix('../data/fm_test_word.dat'));

% --- LinearWord ---
disp('LinearWord');
scale=1.4;
sg('set_kernel', 'LINEAR', 'WORD', size_cache, scale);

sg('set_features', 'TRAIN', fm_train_word);
km=sg('get_kernel_matrix', 'TRAIN');

% km is overwritten with the TEST matrix
sg('set_features', 'TEST', fm_test_word);
km=sg('get_kernel_matrix', 'TEST');
% LocalAlignmentString example: sets a LOCALALIGNMENT kernel on DNA string
% features and fetches the kernel matrix for TRAIN and then for TEST.

size_cache=10;

% --- data ---
addpath('tools');
fm_train_dna=load_matrix('../data/fm_train_dna.dat');
fm_test_dna=load_matrix('../data/fm_test_dna.dat');

% --- Local Alignment String ---
disp('LocalAlignmentString');
sg('set_kernel', 'LOCALALIGNMENT', 'CHAR', size_cache);

sg('set_features', 'TRAIN', fm_train_dna, 'DNA');
km=sg('get_kernel_matrix', 'TRAIN');

% km is overwritten with the TEST matrix
sg('set_features', 'TEST', fm_test_dna, 'DNA');
km=sg('get_kernel_matrix', 'TEST');
% LocalityImprovedString example: sets a LIK kernel on DNA string features
% and fetches the kernel matrix for TRAIN and then for TEST.

size_cache=10;

% --- data ---
addpath('tools');
fm_train_dna=load_matrix('../data/fm_train_dna.dat');
fm_test_dna=load_matrix('../data/fm_test_dna.dat');

% --- Locality Improved String ---
disp('LocalityImprovedString');
% not named 'length': scripts share the workspace, and a variable called
% length would break every later call such as length(idx)
lik_length=5;
inner_degree=5;
outer_degree=inner_degree+2;
sg('set_kernel', 'LIK', 'CHAR', size_cache, lik_length, inner_degree, outer_degree);

sg('set_features', 'TRAIN', fm_train_dna, 'DNA');
km=sg('get_kernel_matrix', 'TRAIN');

% km is overwritten with the TEST matrix
sg('set_features', 'TEST', fm_test_dna, 'DNA');
km=sg('get_kernel_matrix', 'TEST');
% OligoString example: sets an OLIGO kernel with the arguments k and w on
% DNA string features and fetches the kernel matrix for TRAIN and TEST.

size_cache=10;

% --- data ---
addpath('tools');
fm_train_dna=load_matrix('../data/fm_train_dna.dat');
fm_test_dna=load_matrix('../data/fm_test_dna.dat');

% --- Oligo String ---
% banner added so this example announces itself like the other kernel examples
disp('OligoString');
k=3;
w=1.2;
sg('set_kernel', 'OLIGO', 'CHAR', size_cache, k, w);

sg('set_features', 'TRAIN', fm_train_dna, 'DNA');
km=sg('get_kernel_matrix', 'TRAIN');

% km is overwritten with the TEST matrix
sg('set_features', 'TEST', fm_test_dna, 'DNA');
km=sg('get_kernel_matrix', 'TEST');
% Poly kernel example: sets a POLY kernel with degree, inhomogene and
% use_normalization, and fetches the kernel matrix for TRAIN and TEST.

size_cache=10;

% --- data ---
addpath('tools');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

% --- Poly ---
disp('Poly');
degree=4;
inhomogene=false;
use_normalization=true;
sg('set_kernel', 'POLY', 'REAL', size_cache, degree, inhomogene, use_normalization);

sg('set_features', 'TRAIN', fm_train_real);
km=sg('get_kernel_matrix', 'TRAIN');

% km is overwritten with the TEST matrix
sg('set_features', 'TEST', fm_test_real);
km=sg('get_kernel_matrix', 'TEST');
% PolyMatchString example: sets a POLYMATCH kernel on DNA string features
% and fetches the kernel matrix for TRAIN and then for TEST.

size_cache=10;

% --- data ---
addpath('tools');
fm_train_dna=load_matrix('../data/fm_train_dna.dat');
fm_test_dna=load_matrix('../data/fm_test_dna.dat');

% --- Poly Match String ---
disp('PolyMatchString');
degree=3;
inhomogene=false;
sg('set_kernel', 'POLYMATCH', 'CHAR', size_cache, degree, inhomogene);

sg('set_features', 'TRAIN', fm_train_dna, 'DNA');
km=sg('get_kernel_matrix', 'TRAIN');

% km is overwritten with the TEST matrix
sg('set_features', 'TEST', fm_test_dna, 'DNA');
km=sg('get_kernel_matrix', 'TEST');
% PolyMatchWordString example: converts DNA strings to WORD features, sets
% a POLYMATCH kernel and fetches the kernel matrix for TRAIN and TEST.

size_cache=10;

% --- data ---
addpath('tools');
fm_train_dna=load_matrix('../data/fm_train_dna.dat');
fm_test_dna=load_matrix('../data/fm_test_dna.dat');

% --- conversion parameters ---
order=3;
gap=0;
reverse='n'; % bit silly to not use boolean, set 'r' to yield true

% assigned but not passed to any sg call in this script
use_sign=false;
normalization='FULL';

% --- Poly Match WordString ---
disp('PolyMatchWordString');
degree=2;
inhomogene=true;
sg('set_kernel', 'POLYMATCH', 'WORD', size_cache, degree, inhomogene);

sg('set_features', 'TRAIN', fm_train_dna, 'DNA');
sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse);
km=sg('get_kernel_matrix', 'TRAIN');

% km is overwritten with the TEST matrix
sg('set_features', 'TEST', fm_test_dna, 'DNA');
sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse);
km=sg('get_kernel_matrix', 'TEST');
% Sigmoid kernel example: sets a SIGMOID kernel with two parameters and
% fetches the kernel matrix for TRAIN and then for TEST.

size_cache=10;

% --- data ---
addpath('tools');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

% --- sigmoid ---
disp('Sigmoid');
% not named 'gamma': scripts share the workspace, and a variable called
% gamma would shadow the builtin gamma() for everything run afterwards
sigmoid_gamma=1.2;
coef0=1.3;
sg('set_kernel', 'SIGMOID', 'REAL', size_cache, sigmoid_gamma, coef0);

sg('set_features', 'TRAIN', fm_train_real);
km=sg('get_kernel_matrix', 'TRAIN');

% km is overwritten with the TEST matrix
sg('set_features', 'TEST', fm_test_real);
km=sg('get_kernel_matrix', 'TEST');
% SimpleLocalityImprovedString example: sets a SLIK kernel on DNA string
% features and fetches the kernel matrix for TRAIN and then for TEST.

size_cache=10;

% --- data ---
addpath('tools');
fm_train_dna=load_matrix('../data/fm_train_dna.dat');
fm_test_dna=load_matrix('../data/fm_test_dna.dat');

% --- Simple Locality Improved String ---
disp('SimpleLocalityImprovedString');
% not named 'length': scripts share the workspace, and a variable called
% length would break every later call such as length(idx)
slik_length=5;
inner_degree=5;
outer_degree=inner_degree+2;
sg('set_kernel', 'SLIK', 'CHAR', size_cache, slik_length, inner_degree, outer_degree);

sg('set_features', 'TRAIN', fm_train_dna, 'DNA');
km=sg('get_kernel_matrix', 'TRAIN');

% km is overwritten with the TEST matrix
sg('set_features', 'TEST', fm_test_dna, 'DNA');
km=sg('get_kernel_matrix', 'TEST');
% SparseGaussian example: sets a GAUSSIAN kernel on sparse real features
% and fetches the kernel matrix for TRAIN and then for TEST.

size_cache=10;

% --- data ---
addpath('tools');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

% --- Sparse Gaussian ---
disp('SparseGaussian');
width=1.3;
sg('set_kernel', 'GAUSSIAN', 'SPARSEREAL', size_cache, width);

% the loaded matrices are converted with sparse() at the call site
sg('set_features', 'TRAIN', sparse(fm_train_real));
km=sg('get_kernel_matrix', 'TRAIN');

% km is overwritten with the TEST matrix
sg('set_features', 'TEST', sparse(fm_test_real));
km=sg('get_kernel_matrix', 'TEST');
% SparseLinear example: sets a LINEAR kernel on sparse real features and
% fetches the kernel matrix for TRAIN and then for TEST.

size_cache=10;

% --- data ---
addpath('tools');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

% --- Sparse Linear ---
disp('SparseLinear');
scale=1.3;
sg('set_kernel', 'LINEAR', 'SPARSEREAL', size_cache, scale);

% the loaded matrices are converted with sparse() at the call site
sg('set_features', 'TRAIN', sparse(fm_train_real));
km=sg('get_kernel_matrix', 'TRAIN');

% km is overwritten with the TEST matrix
sg('set_features', 'TEST', sparse(fm_test_real));
km=sg('get_kernel_matrix', 'TEST');
% SparsePoly example: sets a POLY kernel on sparse real features and
% fetches the kernel matrix for TRAIN and then for TEST.

size_cache=10;

% --- data ---
addpath('tools');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

% --- Sparse Poly ---
disp('SparsePoly');
degree=3;
inhomogene=true;
use_normalization=false;
sg('set_kernel', 'POLY', 'SPARSEREAL', size_cache, degree, inhomogene, use_normalization);

% the loaded matrices are converted with sparse() at the call site
sg('set_features', 'TRAIN', sparse(fm_train_real));
km=sg('get_kernel_matrix', 'TRAIN');

% km is overwritten with the TEST matrix
sg('set_features', 'TEST', sparse(fm_test_real));
km=sg('get_kernel_matrix', 'TEST');
% Spectrum kernel SVM example: trains SVMLIGHT with a COMMSTRING kernel on
% random DNA toy data and prints the accuracy on the training data and on
% separately generated test data.

rand('seed',17);

%sequence lengths, number of sequences
len=100;
num_train=1000;
num_test=5000;
num_a=5;
aa=(round(len/2-num_a/2)):(round(len/2+num_a/2-1));

%SVM regularization factor C
C=1;

%Spectrum kernel parameters
order=5;
cache=10;
use_sign=true;
normalization='FULL'; %NO,SQRT,LEN,SQLEN,FULL

%generate some toy data
acgt='ACGT';
shift=40;
rand('state',1);

% training set: positives get a run of 'A' at a shifted position
traindat=acgt(ceil(4*rand(len,num_train)));
trainlab=[-ones(1,num_train/2),ones(1,num_train/2)];
aas=floor((shift+1)*rand(num_train,1));
idx=find(trainlab==1);
for i=1:length(idx),
	traindat(aa+aas(i),idx(i))='A';
end

% test set: built the same way
testdat=acgt(ceil(4*rand(len,num_test)));
testlab=[-ones(1,num_test/2),ones(1,num_test/2)];
aas=floor((shift+1)*rand(num_test,1));
idx=find(testlab==1);
for i=1:length(idx),
	testdat(aa+aas(i),idx(i))='A';
end

%traindat'
%input('key to continue')

%train svm
sg('use_linadd', true);
sg('set_features', 'TRAIN', traindat, 'DNA');
sg('set_labels', 'TRAIN', trainlab);
sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1);
sg('add_preproc', 'SORTWORDSTRING');
sg('attach_preproc', 'TRAIN');
sg('set_kernel', 'COMMSTRING', 'WORD', cache, use_sign, normalization);
sg('new_classifier', 'SVMLIGHT');
sg('c', C);
sg('train_classifier');
sg('init_kernel_optimization');

%evaluate svm on train data
sg('set_features', 'TEST', traindat, 'DNA');
sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1);
sg('attach_preproc', 'TEST');
sg('set_labels', 'TEST', trainlab);
out=sg('classify');
fprintf('accuracy: %f \n', mean(sign(out)==trainlab))

%evaluate svm on test data
sg('set_features', 'TEST', testdat, 'DNA');
sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1);
sg('attach_preproc', 'TEST');
sg('set_labels', 'TEST', testlab);
out=sg('classify');
fprintf('accuracy: %f \n', mean(sign(out)==testlab))
% Weighted spectrum check: sums weighted COMMSTRING kernel matrices for
% orders 1..order and prints the maximum absolute difference to the
% WEIGHTEDCOMMSTRING kernel matrix.

rand('seed',17);

%sequence lengths, number of sequences
len=100;
num_train=10;
num_a=5;
aa=(round(len/2-num_a/2)):(round(len/2+num_a/2-1));

%SVM regularization factor C
C=1;

%Spectrum kernel parameters
order=8;
cache=10;
use_sign=false;
normalization='NO'; %NO,SQRT,LEN,SQLEN,FULL

%generate some toy data
acgt='ACGT';
shift=40;
rand('state',1);
traindat=acgt(ceil(4*rand(len,num_train)));
trainlab=[-ones(1,num_train/2),ones(1,num_train/2)];
aas=floor((shift+1)*rand(num_train,1));
idx=find(trainlab==1);
% write a run of 'A' at a shifted position into every positive example
for i=1:length(idx),
	traindat(aa+aas(i),idx(i))='A';
end

%%% spec
weights=(order:-1:1);
weights=weights/sum(weights);
km=zeros(size(traindat,2));
for o=1:order,
	sg('set_features', 'TRAIN', traindat, 'DNA');
	sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', o, order-1);
	sg('add_preproc', 'SORTWORDSTRING');
	sg('attach_preproc', 'TRAIN');
	sg('set_kernel', 'COMMSTRING', 'WORD',cache, use_sign, normalization);
	km=km+weights(o)*sg('get_kernel_matrix', 'TRAIN');
end

%%% wdspec
sg('set_features', 'TRAIN', traindat, 'DNA');
sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, 0, 'r');
sg('add_preproc', 'SORTWORDSTRING');
sg('attach_preproc', 'TRAIN');
sg('set_kernel', 'WEIGHTEDCOMMSTRING', 'WORD', cache, use_sign, normalization);
wkm=sg('get_kernel_matrix', 'TRAIN');

% no semicolon: the difference is printed
max(abs(wkm(:)-km(:)))
% WeightedCommWordString example: converts DNA strings to WORD features,
% attaches the SORTWORDSTRING preprocessor and fetches the
% WEIGHTEDCOMMSTRING kernel matrix for TRAIN and then for TEST.

% --- parameters ---
size_cache=10;
use_sign=0;
reverse='r';
order=8;
gap=0;
normalization='FULL';

% --- data ---
addpath('tools');
fm_train_dna=load_matrix('../data/fm_train_dna.dat');
fm_test_dna=load_matrix('../data/fm_test_dna.dat');

% --- Weighted Comm Word String ---
disp('WeightedCommWordString');
sg('add_preproc', 'SORTWORDSTRING');
sg('set_kernel', 'WEIGHTEDCOMMSTRING', 'WORD', size_cache, use_sign, normalization);

sg('set_features', 'TRAIN', fm_train_dna, 'DNA');
sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse);
sg('attach_preproc', 'TRAIN');
km=sg('get_kernel_matrix', 'TRAIN');

% km is overwritten with the TEST matrix
sg('set_features', 'TEST', fm_test_dna, 'DNA');
sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse);
sg('attach_preproc', 'TEST');
km=sg('get_kernel_matrix', 'TEST');
% WeightedDegreePositionString example: sets a WEIGHTEDDEGREEPOS kernel on
% DNA string features and fetches the kernel matrix for TRAIN and TEST.

size_cache=10;

% --- data ---
addpath('tools');
fm_train_dna=load_matrix('../data/fm_train_dna.dat');
fm_test_dna=load_matrix('../data/fm_test_dna.dat');

% --- Weighted Degree Position String ---
disp('WeightedDegreePositionString');
degree=20;
sg('set_kernel', 'WEIGHTEDDEGREEPOS', 'CHAR', size_cache, degree);

sg('set_features', 'TRAIN', fm_train_dna, 'DNA');
km=sg('get_kernel_matrix', 'TRAIN');

% km is overwritten with the TEST matrix
sg('set_features', 'TEST', fm_test_dna, 'DNA');
km=sg('get_kernel_matrix', 'TEST');
% WeightedDegreeString example: sets a WEIGHTEDDEGREE kernel on DNA string
% features and fetches the kernel matrix for TRAIN and then for TEST.

size_cache=10;

% --- data ---
addpath('tools');
fm_train_dna=load_matrix('../data/fm_train_dna.dat');
fm_test_dna=load_matrix('../data/fm_test_dna.dat');

% --- Weighted Degree String ---
disp('WeightedDegreeString');
degree=20;
sg('set_kernel', 'WEIGHTEDDEGREE', 'CHAR', size_cache, degree);

sg('set_features', 'TRAIN', fm_train_dna, 'DNA');
km=sg('get_kernel_matrix', 'TRAIN');

% km is overwritten with the TEST matrix
sg('set_features', 'TEST', fm_test_dna, 'DNA');
km=sg('get_kernel_matrix', 'TEST');
% This script should enable you to rerun the experiment in the
% paper that we labeled with "christmas star".
%
% The task is to classify two star-shaped classes that share the
% midpoint. The difficulty of the learning problem depends on the
% distance between the classes, which is varied.
%
% Our model selection leads to a choice of C = 0.5. The model
% selection is not repeated inside this script.

% The script fills these variables by indexed or field assignment. Leftovers
% from an earlier example in the same workspace (e.g. a numeric 'result' or
% a differently sized 'x') would make those assignments fail, so start clean.
clear radius_star dummy x w result

% Preliminary settings:
C = 0.5; % SVM Parameter
cache_size = 50; % cache per kernel in MB
svm_eps=1e-3; % svm epsilon
mkl_eps=1e-3; % mkl epsilon

no_obs = 50; % number of observations / data points (sum for train and test and both classes)
             % 2000 was used in the paper
k_star = 20; % number of "leaves" of the stars
alpha = 0.3; % noise level of the data

radius_star(:,1) = [4.1:0.2:10]'; % increasing radius of the 1st class
radius_star(:,2) = 4*ones(length(radius_star(:,1)),1); % fixed radius of the 2nd class
% distance between the classes: diff(radius_star(:,1)-radius_star(:,2))

rbf_width = [0.01 0.1 1 10 100]; % different width for the five used rbf kernels
mkl_norm = 1; % >=1

rand('state', 17);
randn('state', 17);

%%%%
%%%% Great loop: train MKL for every data set (the different distances between the stars)
%%%%

%sg('loglevel', 'ALL');
%sg('echo', 'ON');

num_kernels = numel(rbf_width); % one feature object is added per sub-kernel

for kk = 1:size(radius_star,1)

	% data generation
	fprintf('MKL for radius %+02.2f \n', radius_star(kk,1))

	dummy(1,:) = rand(1,4*no_obs);
	noise = alpha*randn(1,4*no_obs);

	dummy(2,:) = sin(k_star*pi*dummy(1,:)) + noise; % sine
	dummy(2,1:2*no_obs) = dummy(2,1:2*no_obs)+ radius_star(kk,1); % distance shift: first class
	dummy(2,(2*no_obs+1):end) = dummy(2,(2*no_obs+1):end)+ radius_star(kk,2); % distance shift: second class

	dummy(1,: ) = 2*pi*dummy(1,:);

	x(1,:) = dummy(2,:).*sin(dummy(1,:));
	x(2,:) = dummy(2,:).*cos(dummy(1,:));

	train_y = [-ones(1,no_obs) ones(1,no_obs)];
	test_y = [-ones(1,no_obs) ones(1,no_obs)];

	% odd columns are used for training, even columns for testing
	train_x = x(:,1:2:end);
	test_x = x(:,2:2:end);

	clear dummy x;

	% train MKL
	sg('clean_kernel');
	sg('clean_features', 'TRAIN');
	for k = 1:num_kernels
		sg('add_features','TRAIN', train_x); % set a trainingset for every SVM
	end
	sg('set_labels','TRAIN', train_y); % set the labels

	sg('new_classifier', 'MKL_CLASSIFICATION');
	sg('mkl_use_interleaved_optimization', 1); % 0, 1
	sg('set_solver', 'DIRECT'); % DIRECT, NEWTON, CPLEX, AUTO, GLPK
	%sg('set_constraint_generator', 'LIBSVM');
	sg('mkl_parameters', mkl_eps, 0, mkl_norm);
	sg('svm_epsilon', svm_eps);

	sg('set_kernel', 'COMBINED', 0);
	for k = 1:num_kernels
		sg('add_kernel', 1, 'GAUSSIAN', 'REAL', cache_size, rbf_width(k));
	end

	sg('c', C);
	sg('train_classifier');
	[b,alphas]=sg('get_svm') ;
	w(kk,:) = sg('get_subkernel_weights');

	% calculate train error
	sg('clean_features', 'TEST');
	for k = 1:num_kernels
		sg('add_features','TEST',train_x);
	end
	sg('set_labels','TEST', train_y);
	sg('set_threshold', 0);
	result.trainout(kk,:)=sg('classify');
	result.trainerr(kk) = mean(train_y~=sign(result.trainout(kk,:)),2);

	% calculate test error
	sg('clean_features', 'TEST');
	for k = 1:num_kernels
		sg('add_features','TEST',test_x);
	end
	sg('set_labels','TEST',test_y);
	sg('set_threshold', 0);
	result.testout(kk,:)=sg('classify');
	result.testerr(kk) = mean(test_y~=sign(result.testout(kk,:)),2);

end

disp('done. now w contains the kernel weightings and result test/train outputs and errors')
% MKL_MULTICLASS example: trains on a COMBINED kernel made of LINEAR,
% GAUSSIAN and POLY sub-kernels and prints the classification result.

% Explicit examples on how to use the different classifiers

% --- parameters (use_bias, width and max_train_time are not used below) ---
size_cache=10;
C=1.2;
use_bias=false;
epsilon=1e-5;
width=1.2;
mkl_eps=0.001;
mkl_norm=1; % only L1 by now
max_train_time=600;

% --- data ---
addpath('tools');
label_train_multiclass=load_matrix('../data/label_train_multiclass.dat');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

% --- MKL_MULTICLASS ---
disp('MKL_MULTICLASS');
sg('new_classifier', 'MKL_MULTICLASS');

disp('Combined');
sg('clean_kernel');
sg('clean_features','TRAIN');
sg('clean_features','TEST');
sg('set_kernel', 'COMBINED', size_cache);

% sub-kernel 1: linear
sg('add_kernel', 1, 'LINEAR', 'REAL', size_cache);
sg('add_features', 'TRAIN', fm_train_real);
sg('add_features', 'TEST', fm_test_real);

% sub-kernel 2: gaussian (last argument 1)
sg('add_kernel', 1, 'GAUSSIAN', 'REAL', size_cache, 1);
sg('add_features', 'TRAIN', fm_train_real);
sg('add_features', 'TEST', fm_test_real);

% sub-kernel 3: polynomial (last argument 2)
sg('add_kernel', 1, 'POLY', 'REAL', size_cache, 2);
sg('add_features', 'TRAIN', fm_train_real);
sg('add_features', 'TEST', fm_test_real);

sg('set_labels', 'TRAIN', label_train_multiclass);
sg('svm_epsilon', epsilon);
sg('c', C);
sg('mkl_parameters', mkl_eps, 0, mkl_norm);
sg('train_classifier');

result=sg('classify');
% no semicolon: the result is printed
result
% This script should enable you to rerun the experiment in the
% paper that we labeled "mixture linear and sine ".
%
% The task is to learn a regression function where the true function
% is given by a mixture of 2 sine waves in addition to a linear trend.
% We vary the frequency of the second higher frequency sine wave.

% Setup: MKL on 10 RBF kernels of different widths on 1000 examples

% Preliminary setting

% 'weights' is filled row by row below; a leftover 'weights' of another
% width from an earlier example in the same workspace would make that
% assignment fail, so start clean.
clear weights

% kernel width for 10 basic SVMs
% (assigned as a whole so no stale entries from an earlier run survive)
rbf_width = [0.001 0.005 0.01 0.05 0.1 1 10 50 100 1000];

mkl_norm = 1; % >=1

% SVM parameter
C = 1;
cache_size = 50;
mkl_eps = 1e-4;
svm_eps = 1e-4;
svr_tube = 0.01;
debug = 0;

% data
f = [0:20]; % parameter that varies the frequency of the second sine wave
no_obs = 20; % number of observations

if debug
	sg('loglevel', 'ALL');
	sg('echo', 'ON');
else
	sg('loglevel', 'ERROR');
	sg('echo', 'OFF');
end

num_kernels = numel(rbf_width); % one feature object is added per sub-kernel

for kk = 1:length(f) % Big loop

	% data generation
	train_x = [0:((4*pi)/(no_obs-1)):4*pi];
	trend = 2 * train_x* ((pi)/(max(train_x)-min(train_x)));
	wave1 = sin(train_x);
	wave2 = sin(f(kk)*train_x);
	train_y = trend + wave1 + wave2;

	% MKL learning
	sg('new_classifier', 'MKL_REGRESSION');
	sg('mkl_parameters', mkl_eps, 0, mkl_norm);
	sg('mkl_use_interleaved_optimization', 1); % 0, 1
	sg('set_solver', 'DIRECT'); % DIRECT, NEWTON, CPLEX, AUTO, GLPK
	sg('c', C);
	sg('svm_epsilon',svm_eps);
	sg('svr_tube_epsilon',svr_tube);
	sg('clean_features', 'TRAIN');
	sg('clean_kernel');

	sg('set_labels', 'TRAIN', train_y); % set labels
	for k = 1:num_kernels
		sg('add_features','TRAIN', train_x); % add features for every basic SVM
	end

	sg('set_kernel', 'COMBINED', 0);
	for k = 1:num_kernels
		sg('add_kernel', 1, 'GAUSSIAN', 'REAL', cache_size, rbf_width(k));
	end

	sg('train_regression');

	weights(kk,:) = sg('get_subkernel_weights') ;
	fprintf('frequency: %02.2f rbf-kernel-weights: %02.2f %02.2f %02.2f %02.2f %02.2f %02.2f %02.2f %02.2f %02.2f %02.2f \n', f(kk), weights(kk,:))
end
% This script should enable you to rerun the experiment in the
% paper that we labeled "sine".
%
% In this regression task a sine wave is to be learned.
% We vary the frequency of the wave.
%
% Output: one line per frequency with the learned sub-kernel weights;
% the weights are also kept in `weights` (one row per frequency).

% Preliminary settings:

% Parameter for the SVMs.
C            = 10;    % obtained via model selection (not included in the script)
cache_size   = 10;
mkl_eps      = 1e-4;  % threshold for precision
svm_eps      = 1e-4;
svr_tube_eps = 1e-3;
debug = 0;

% Kernel widths for the "basic" SVMs. Assigned as one literal vector
% (rather than element by element) so that a longer rbf_width left behind
% by another script run in the same session cannot leak into this one.
rbf_width = [0.005 0.05 0.5 1 10];
num_kernels = numel(rbf_width);

mkl_norm = 1; % >=1

% data
f = [0.1:0.2:5];   % values for the different frequencies
no_obs = 100;      % number of observations

if debug
    sg('loglevel', 'ALL');
    sg('echo', 'ON');
else
    sg('loglevel', 'ERROR');
    sg('echo', 'OFF');
end

% The sample positions do not depend on the frequency, so they are
% generated once. linspace yields exactly no_obs samples from 1 to
% 10*2*pi inclusive.
train_x = linspace(1, 10*2*pi, no_obs);

% Fresh result matrix: avoids growing it inside the loop and avoids a
% dimension mismatch with a `weights` variable of different width that
% may still exist in the workspace from another example.
weights = zeros(length(f), num_kernels);

% format string: frequency followed by one field per sub-kernel weight
report_fmt = ['frequency: %02.2f rbf-kernel-weights: ' ...
              repmat('%02.2f ', 1, num_kernels) '\n'];

for kk = 1:length(f)   % big loop for the different learning problems

    % data generation
    train_y = sin(f(kk)*train_x);

    % initialize MKL-SVR
    sg('new_regression', 'MKL_REGRESSION');
    sg('mkl_parameters', mkl_eps, 0, mkl_norm);
    sg('mkl_use_interleaved_optimization', 1); % 0, 1
    sg('set_solver', 'GLPK'); % DIRECT, NEWTON, CPLEX, AUTO, GLPK
    sg('c', C);
    sg('svm_epsilon', svm_eps);
    sg('svr_tube_epsilon', svr_tube_eps);
    sg('clean_features', 'TRAIN');
    sg('clean_kernel');

    sg('set_labels', 'TRAIN', train_y);       % set labels

    % add features for every SVR
    for ii = 1:num_kernels
        sg('add_features','TRAIN', train_x);
    end

    sg('set_kernel', 'COMBINED', 0);
    % one Gaussian sub-kernel per width, in the same order as the features
    for ii = 1:num_kernels
        sg('add_kernel', 1, 'GAUSSIAN', 'REAL', cache_size, rbf_width(ii));
    end

    sg('train_regression');

    weights(kk,:) = sg('get_subkernel_weights');
    fprintf(report_fmt, f(kk), weights(kk,:));
end
% Preprocessor example: LOGPLUSONE
%
% Registers the LOGPLUSONE preprocessor, attaches it to the TRAIN and then
% the TEST features and requests the CHI2 kernel matrix for each of them.
% `km` ends up holding the matrix requested for 'TEST'.

size_cache=10;
width=1.4;

addpath('tools');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

% LogPlusOne
disp('LogPlusOne');
sg('add_preproc', 'LOGPLUSONE');
sg('set_kernel', 'CHI2', 'REAL', size_cache, width);

% same three steps for both feature sets, TRAIN first
targets  = {'TRAIN', 'TEST'};
matrices = {fm_train_real, fm_test_real};
for t = 1:numel(targets)
    sg('set_features', targets{t}, matrices{t});
    sg('attach_preproc', targets{t});
    km=sg('get_kernel_matrix', targets{t});
end
% Preprocessor example: NORMONE
%
% Registers the NORMONE preprocessor, attaches it to the TRAIN and then
% the TEST features and requests the CHI2 kernel matrix for each of them.
% `km` ends up holding the matrix requested for 'TEST'.

size_cache=10;
width=1.4;

addpath('tools');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

% NormOne
disp('NormOne');
sg('add_preproc', 'NORMONE');
sg('set_kernel', 'CHI2', 'REAL', size_cache, width);

% same three steps for both feature sets, TRAIN first
targets  = {'TRAIN', 'TEST'};
matrices = {fm_train_real, fm_test_real};
for t = 1:numel(targets)
    sg('set_features', targets{t}, matrices{t});
    sg('attach_preproc', targets{t});
    km=sg('get_kernel_matrix', targets{t});
end
% Preprocessor example: PRUNEVARSUBMEAN
%
% Registers the PRUNEVARSUBMEAN preprocessor (with the divide_by_std flag),
% attaches it to the TRAIN and then the TEST features and requests the
% CHI2 kernel matrix for each of them.
% `km` ends up holding the matrix requested for 'TEST'.

size_cache=10;
width=1.4;

addpath('tools');
fm_train_real=load_matrix('../data/fm_train_real.dat');
fm_test_real=load_matrix('../data/fm_test_real.dat');

% PruneVarSubMean
disp('PruneVarSubMean');
divide_by_std=true;
sg('add_preproc', 'PRUNEVARSUBMEAN', divide_by_std);
sg('set_kernel', 'CHI2', 'REAL', size_cache, width);

% same three steps for both feature sets, TRAIN first
targets  = {'TRAIN', 'TEST'};
matrices = {fm_train_real, fm_test_real};
for t = 1:numel(targets)
    sg('set_features', targets{t}, matrices{t});
    sg('attach_preproc', targets{t});
    km=sg('get_kernel_matrix', targets{t});
end
% Preprocessor example: SORTULONGSTRING
%
% Loads DNA string features, converts them from CHAR strings to ULONG
% strings, attaches the SORTULONGSTRING preprocessor and requests the
% COMMSTRING kernel matrix, first for TRAIN and then for TEST.
% `km` ends up holding the matrix requested for 'TEST'.

size_cache=10;

addpath('tools');
fm_train_dna=load_matrix('../data/fm_train_dna.dat');
fm_test_dna=load_matrix('../data/fm_test_dna.dat');

%
% complex string features;
%

order=3;
gap=0;
reverse='n'; % bit silly to not use boolean, set 'r' to yield true

use_sign=false;
normalization='FULL';

% SortUlongString
% The banner names the preprocessor being demonstrated, like the other
% preprocessor examples do (it used to print the kernel example's name).
disp('SortUlongString');
sg('add_preproc', 'SORTULONGSTRING');
sg('set_kernel', 'COMMSTRING', 'ULONG', size_cache, use_sign, normalization);

% same steps for both feature sets, TRAIN first
targets   = {'TRAIN', 'TEST'};
sequences = {fm_train_dna, fm_test_dna};
for t = 1:numel(targets)
    sg('set_features', targets{t}, sequences{t}, 'DNA');
    % the conversion has to happen before the preprocessor is attached,
    % as in the original call order
    sg('convert', targets{t}, 'STRING', 'CHAR', 'STRING', 'ULONG', order, order-1, gap, reverse);
    sg('attach_preproc', targets{t});
    km=sg('get_kernel_matrix', targets{t});
end
% Preprocessor example: SORTWORDSTRING
%
% Loads DNA string features, converts them from CHAR strings to WORD
% strings, attaches the SORTWORDSTRING preprocessor and requests the
% COMMSTRING kernel matrix, first for TRAIN and then for TEST.
% `km` ends up holding the matrix requested for 'TEST'.

size_cache=10;

addpath('tools');
fm_train_dna=load_matrix('../data/fm_train_dna.dat');
fm_test_dna=load_matrix('../data/fm_test_dna.dat');

order=3;
gap=0;
reverse='n';

use_sign=false;
normalization='FULL';

% SortWordString
% The banner names the preprocessor being demonstrated, like the other
% preprocessor examples do (it used to print the kernel example's name).
disp('SortWordString');
sg('add_preproc', 'SORTWORDSTRING');
sg('set_kernel', 'COMMSTRING', 'WORD', size_cache, use_sign, normalization);

% same steps for both feature sets, TRAIN first
targets   = {'TRAIN', 'TEST'};
sequences = {fm_train_dna, fm_test_dna};
for t = 1:numel(targets)
    sg('set_features', targets{t}, sequences{t}, 'DNA');
    % the conversion has to happen before the preprocessor is attached,
    % as in the original call order
    sg('convert', targets{t}, 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse);
    sg('attach_preproc', targets{t});
    km=sg('get_kernel_matrix', targets{t});
end
% Regression example: KRR
%
% Trains a KRR regression machine with a Gaussian kernel on the real-valued
% training features and queries it for the test features.

% ---- parameters ---------------------------------------------------------
size_cache=10;
width=2.1;
C=1.2;
tube_epsilon=1e-2;   % defined but not passed to sg in this script

% ---- data ---------------------------------------------------------------
addpath('tools');
label_train=load_matrix('../data/label_train_twoclass.dat');
fm_train=load_matrix('../data/fm_train_real.dat');
fm_test=load_matrix('../data/fm_test_real.dat');

% ---- KRR ----------------------------------------------------------------
disp('KRR');
tau=1.2;

% training side: features, kernel, labels
sg('set_features', 'TRAIN', fm_train);
sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width);
sg('set_labels', 'TRAIN', label_train);

% machine and its settings
sg('new_regression', 'KRR');
sg('krr_tau', tau);
sg('c', C);

sg('train_regression');

% outputs for the test features are fetched with the 'classify' command
sg('set_features', 'TEST', fm_test);
result=sg('classify');
% Regression example: LibSVR
%
% Trains a LIBSVR regression machine with a Gaussian kernel on the
% real-valued training features and queries it for the test features.

% ---- parameters ---------------------------------------------------------
size_cache=10;
width=2.1;
C=1.2;
tube_epsilon=1e-2;

% ---- data ---------------------------------------------------------------
addpath('tools');
label_train=load_matrix('../data/label_train_twoclass.dat');
fm_train=load_matrix('../data/fm_train_real.dat');
fm_test=load_matrix('../data/fm_test_real.dat');

% ---- LibSVR -------------------------------------------------------------
disp('LibSVR');

% training side: features, kernel, labels
sg('set_features', 'TRAIN', fm_train);
sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width);
sg('set_labels', 'TRAIN', label_train);

% machine and its settings
sg('new_regression', 'LIBSVR');
sg('svr_tube_epsilon', tube_epsilon);
sg('c', C);

sg('train_regression');

% outputs for the test features are fetched with the 'classify' command
sg('set_features', 'TEST', fm_test);
result=sg('classify');
% Regression example: SVR Light
%
% Trains an SVRLIGHT regression machine with a Gaussian kernel on the
% real-valued training features and queries it for the test features.
% All sg calls are wrapped in try/catch; on any error a notice that
% SVRLight is not available is printed instead.

% ---- parameters ---------------------------------------------------------
size_cache=10;
width=2.1;
C=1.2;
tube_epsilon=1e-2;

% ---- data ---------------------------------------------------------------
addpath('tools');
label_train=load_matrix('../data/label_train_twoclass.dat');
fm_train=load_matrix('../data/fm_train_real.dat');
fm_test=load_matrix('../data/fm_test_real.dat');

% ---- SVR Light ----------------------------------------------------------
try
    disp('SVRLight');

    % training side: kernel, features, labels
    sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width);
    sg('set_features', 'TRAIN', fm_train);
    sg('set_labels', 'TRAIN', label_train);

    % machine and its settings
    sg('new_regression', 'SVRLIGHT');
    sg('svr_tube_epsilon', tube_epsilon);
    sg('c', C);

    sg('train_regression');

    % outputs for the test features are fetched with the 'classify' command
    sg('set_features', 'TEST', fm_test);
    result=sg('classify');
catch
    % best effort: any failure above is reported as missing SVRLight support
    disp('No support for SVRLight available.')
end
%% Dynamic programming example (best_path_trans)
%
% The variables penalty_array, block, model, state_signals, seg_path,
% a_trans and loss are used below without being defined in this script,
% so they are expected to come from the loaded data file.

%% load data
load('-mat', '../data/DynProg_example.dat')

%% set a number of defaults
use_orf = 1;
num_svms = 8;
use_long_transitions = 1;
threshold = 1000;
long_transition_max_len = 100000;
% pad content_pred with zero rows up to num_svms rows (no-op if it already
% has that many)
block.content_pred(end+1:num_svms,:) = 0;
viterbi_nbest = [1 0] ;

%% reshape the training parameters and additional information like
%% length constraints and transformation type and pass them to shogun
num_penalties = length(penalty_array);
for idx = 1:num_penalties
    cur_penalty = penalty_array{idx};

    all_ids(idx)         = cur_penalty.id;
    all_names{idx}       = cur_penalty.name;
    all_limits(:,idx)    = cur_penalty.limits;
    all_penalties(:,idx) = cur_penalty.penalties;

    % entries without a transform fall back to 'linear'
    if ~isempty(cur_penalty.transform)
        all_transform{idx} = cur_penalty.transform;
    else
        all_transform{idx} = 'linear';
    end

    all_min_values(idx) = cur_penalty.min_value;
    all_max_values(idx) = cur_penalty.max_value;
    all_use_cache(idx)  = cur_penalty.use_cache;
    all_use_svm(idx)    = cur_penalty.use_svm;
    all_do_calc(idx)    = 1;
end

% ids are shifted by one before being handed over
sg('set_plif_struct', int32(all_ids)-1, all_names, all_limits, all_penalties, ...
   all_transform, all_min_values, all_max_values, ...
   int32(all_use_cache), int32(all_use_svm), int32(all_do_calc));

%% pass the data to shogun
sg('init_dyn_prog', num_svms)
% positions are shifted by one before being handed over
sg('set_lin_feat', block.seq, int32(block.all_pos-1), block.content_pred);
sg('set_model', model.transition_pointers, use_orf, int32(model.mod_words), int32(state_signals),int32(model.orf_info))
sg('set_feature_matrix', block.features)
sg('long_transition_settings', use_long_transitions, threshold, long_transition_max_len)

%% run the dynamic program
% NOTE(review): the output variable `path` has the same name as the
% built-in path function of MATLAB/Octave.
[path_scores, path, ppos]= sg('best_path_trans', model.p', model.q', int32(viterbi_nbest), seg_path, a_trans, loss);