function collected = hlp_collect_datasets(directory,varargin)
% Find all loadable datasets in some directory tree and collect arbitrary properties.
% Collected = hlp_collect_datasets(Directory, Options...)
%
% In:
%   Directory    : the directory to be searched for eeg datasets
%
%   Options...   : optional name-value pairs; possible names are:
%                   'pattern': regular expression pattern of file paths considered (default: regexptranslate('wildcard','*.set'))
%                   'nopattern': cell array of regular expression patterns (for files/directories) to exclude from scan (default: {})
%                   'checkset': whether to run an eeg_checkset on the data (default: 1)
%                   'nowarnings': exclude files that give a warning when loading or running eeg_checkset (default: 1)
%                   'nodialogs': exclude files that create dialogs when loading or running eeg_checkset (default: 1)
%                   'conditions': cell array of functions that check custom conditions on the data (e.g., @(EEG) ~isempty(EEG.icawinv)) (default: [])
%                   'maxsize': maximum filesize considered, in bytes (default: 5*2^20 = 5MB)
%                   'maxtime': maximum allowed file processing time, in seconds (default: Inf)
%                   'maxnumber': maximum number of entries returned (default: Inf)
%                   'collect': a function of the EEG set (and optionally file path); the function's return values are collected
%                              for every admissible dataset (default: @(EEG,path) path)
%                   'fileconditions': cell array of functions that check custom conditions on the filename (e.g. @(path,name,ext) exist([path filesep name '.xxx'],'file'))
%                                     (default: @(path,name,ext) exist([path filesep name '.fdt'],'file') || exist([path filesep name '.dat'],'file'))
% 
% Out:
%   Collected : cell array of admissible file paths (or desired contents, if 'collect' was specified)
%
% Examples:
%   % collect ICA maps and chanlocs of all datasets for which more than half of the channels have locations:
%   collected = hlp_collect_datasets('/data/projects', 'nopattern',{'christian','duann','phase_tunnel'}, 'conditions',@(EEG) ~isempty(EEG.icawinv) && mean(cellfun('isempty',{EEG.chanlocs.X})) < 0.5, 'collect',@(EEG,path){path,EEG.icawinv,EEG.chanlocs,EEG.icachansind})
%
% Notes:
%   If the function is terminated prematurely, the global variables collected_so_far and num_collected_so_far give the current data:
%   global collected_so_far num_collected_so_far; mydata = collected_so_far(1:num_collected_so_far)
%
%   When the only option argument is a struct, it is taken as an already-normalized options struct
%   (this is how the function calls itself recursively); in that case no defaults are applied and
%   the global progress variables are not reset.
%
%                               Laura Froelich, Christian Kothe, Swartz Center for Computational Neuroscience, UCSD
%                               2010-09-10

global collected_so_far num_collected_so_far;                                % collected_so_far(1:num_collected_so_far) is what we have so far...

if length(varargin)==1 && isstruct(varargin{1})
    opts = varargin{1};                                                     % opts is already a struct: fast path (recursive calls)
else
    opts = hlp_varargin2struct(varargin, 'pattern',regexptranslate('wildcard','*.set'), 'nopattern',{}, 'checkset',1, 'nowarnings',1 ,'nodialogs',1, ...
        'maxsize',5*2^20, 'maxtime',Inf, 'maxnumber',Inf, 'conditions',[], 'collect',@(EEG,path) path, ....
        'fileconditions',@(path,name,ext) exist([path filesep  name '.fdt'],'file') || exist([path filesep name '.dat'],'file'));

    % Normalize the list-valued options to row cell arrays. An empty value means "no entries":
    % wrapping [] as {[]} would later be invoked like a function and reject every file.
    if isempty(opts.conditions)
        opts.conditions = {};
    elseif ~iscell(opts.conditions)
        opts.conditions = {opts.conditions};
    end
    if isempty(opts.nopattern)
        opts.nopattern = {};
    elseif ~iscell(opts.nopattern)
        opts.nopattern = {opts.nopattern};
    end
    if isempty(opts.fileconditions)
        opts.fileconditions = {};
    elseif ~iscell(opts.fileconditions)
        opts.fileconditions = {opts.fileconditions};
    end
    % "for x = cellarray" iterates over columns, so force row orientation
    opts.conditions = opts.conditions(:)';
    opts.nopattern = opts.nopattern(:)';
    opts.fileconditions = opts.fileconditions(:)';

    if opts.nodialogs
        here = fileparts(mfilename('fullpath'));
        addpath(here);                                                      % need to be able to call this recursively after we cd'd to another path
        % We are about to change the working directory, so a relative search directory
        % must be made absolute first or it would be resolved against the wrong folder.
        [ok,info] = fileattrib(directory);
        if ok && isstruct(info) && numel(info)==1
            directory = info.Name; end
        olddir = pwd;
        cd([here filesep 'private' filesep 'dialogs_disabled']);
        go_back = onCleanup(@()cd(olddir)); %#ok<NASGU>                     % restore the working directory when the top-level call exits
    end
    collected_so_far = {};
    num_collected_so_far = 0;
end

disp(['entering ' directory '...']);
collected = {};
topfiles = dir(directory);
topfiles = topfiles([topfiles.bytes] <= opts.maxsize);                      % discard too large files
for k = 1:length(topfiles)                                                  % for each admissible dir entry
    if length(collected) >= opts.maxnumber                                  % stop once the requested number of entries is reached
        break; end
    item = topfiles(k).name;
    whole_path = [directory filesep item];
    if any(cellfun(@(x) ~isempty(regexp(whole_path,x,'once')), opts.nopattern)) % discard disallowed patterns
        disp(['skipping ' whole_path '...']);
        continue;
    end
    if topfiles(k).isdir
        if ~isempty(item) && item(1)~='.'                                   % discard self & parent paths, and hidden paths
            subopts = opts;
            subopts.maxnumber = opts.maxnumber - length(collected);         % sub-tree may only use the remaining budget
            collected = [collected hlp_collect_datasets(whole_path, subopts)]; %#ok<AGROW> % recurse...
        end
    elseif ~isempty(regexp(whole_path,opts.pattern,'once'))                 % discard non-matching files
        % Any error raised inside this try block (including the deliberate error() calls below)
        % excludes the current file; the message is reported in the catch branch.
        try
            fprintf('testing %s... ', whole_path);                          % path is passed as data, not as the format string
            [fpath,fname,fext] = fileparts(whole_path);
            for cond = opts.fileconditions                                  % check for file name conditions
                if cond{1}(fpath,fname,fext)
                    % succeeded
                else
                    error('file name condition violated');
                end
            end
            t0 = tic;                                                       % measure processing time
            % evalc captures the console output; item and directory are referenced as workspace
            % variables so that quotes or other special characters in paths cannot alter the evaluated code
            [conout,data] = evalc('pop_loadset(''filename'',item, ''filepath'',directory, ''loadmode'',''info'', ''check'',''off'')');
            if opts.nowarnings && ~isempty(strfind(lower(conout),'warning')) % discard files with load warnings
                error('loadset warning'); end
            if opts.checkset
                [conout,data] = evalc('eeg_checkset(data)');
                if opts.nowarnings && ~isempty(strfind(lower(conout),'warning')) % discard files with checkset warnings
                    error('checkset warning'); end
            end
            if toc(t0) >= opts.maxtime                                      % discard files that take too long to process
                error('processing time exceeded'); end
            for cond = opts.conditions                                      % check for additional conditions
                if cond{1}(data)
                    % succeeded
                else
                    error('dataset condition violated');
                end
            end
            if nargin(opts.collect) == 2                                    % collect properties
                selection = opts.collect(data,whole_path);
            else
                selection = opts.collect(data);
            end
            collected{end+1} = selection; %#ok<AGROW>
            num_collected_so_far = num_collected_so_far+1;
            if length(collected_so_far) < num_collected_so_far
                collected_so_far{1+2*end} = []; end                         % grow this array in ~ constant time
            collected_so_far{num_collected_so_far} = selection;             % track results globally, too
            disp('included.');
        catch e
            disp(['excluded: ' e.message]);
        end
    end
end

