## Copyright (C) 2021 Stefano Guidoni <ilguido@users.sf.net>
##
## This file is part of the statistics package for GNU Octave.
##
## This program is free software; you can redistribute it and/or modify it under
## the terms of the GNU General Public License as published by the Free Software
## Foundation; either version 3 of the License, or (at your option) any later
## version.
##
## This program is distributed in the hope that it will be useful, but WITHOUT
## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
## details.
##
## You should have received a copy of the GNU General Public License along with
## this program; if not, see <http://www.gnu.org/licenses/>.

## -*- texinfo -*-
## @deftypefn  {statistics} {@var{T} =} cluster (@var{Z}, "Cutoff", @var{C})
## @deftypefnx {statistics} {@var{T} =} cluster (@var{Z}, "Cutoff", @var{C}, "Depth", @var{D})
## @deftypefnx {statistics} {@var{T} =} cluster (@var{Z}, "Cutoff", @var{C}, "Criterion", @var{criterion})
## @deftypefnx {statistics} {@var{T} =} cluster (@var{Z}, "MaxClust", @var{N})
##
## Define clusters from an agglomerative hierarchical cluster tree.
##
## Given a hierarchical cluster tree @var{Z} generated by the @code{linkage}
## function, @code{cluster} defines clusters, using a threshold value @var{C} to
## identify new clusters ('Cutoff') or according to a maximum number of desired
## clusters @var{N} ('MaxClust').
##
## @var{criterion} is used to choose the criterion for defining clusters, which
## can be either "inconsistent" (default) or "distance". When using
## "inconsistent", @code{cluster} compares the threshold value @var{C} to the
## inconsistency coefficient of each link; when using "distance", @code{cluster}
## compares the threshold value @var{C} to the height of each link.
## @var{D} is the depth used to evaluate the inconsistency coefficient, its
## default value is 2.
##
## @code{cluster} uses "distance" as a criterion for defining new clusters when
## it is used the 'MaxClust' method.
##
## @seealso{clusterdata, dendrogram, inconsistent, kmeans, linkage, pdist}
## @end deftypefn

function T = cluster (Z, opt, varargin)
  switch (lower (opt))
  ## check the input
    case "cutoff"
      if (nargin < 3)
        print_usage ();
      else
        C = varargin{1};
        D = 2;
        criterion = "inconsistent";
        if (nargin > 3)
          pair_index = 2;
          while (pair_index < (nargin - 2))
            switch (lower (varargin{pair_index}))
              case "depth"
                D = varargin{pair_index + 1};
              case "criterion"
                criterion = varargin{pair_index + 1};
              otherwise
                error ("cluster: unknown property %s", varargin{pair_index});
            endswitch
            pair_index += 2;
          endwhile
        endif
      endif
      if ((! (isscalar (C) || isvector (C))) || (C < 0))
        error ...
          (["cluster: C must be a positive scalar or a vector of positive"...
            "numbers"]);
      endif

    case "maxclust"
      if (nargin != 3)
        print_usage ();
      else
        N = varargin{1};
        C = [];
      endif
      if ((! (isscalar (N) || isvector (N))) || (N < 0))
        error ...
          (["cluster: N must be a positive number or a vector of positive"...
            "numbers"]);
      endif

    otherwise
      error ("cluster: unknown option %s", opt);
  endswitch

  if ((columns (Z) != 3) || (! isnumeric (Z)) ...
      (! (max (Z(end, 1:2)) == rows (Z) * 2)))
    error ("cluster: Z must be a matrix generated by the linkage function");
  endif

  ## number of observations
  n = rows (Z) + 1;

  ## vector of values used by the threshold check
  vThresholds = [];

  ## starting number of clusters
  nClusters = 1;

  ## the return value is the matrix T, constituted by one or more vector vT
  T = [];
  vT = zeros (1, n);

  ## main logic
  ## a few checks and computations before launching the recursive function
  switch (lower (opt))
    case "cutoff"
      switch (lower (criterion))
        case "inconsistent"
          vThresholds = inconsistent (Z, D)(:, 4);
        case "distance"
          vThresholds = Z(:, 3);
        otherwise
          error ("cluster: unkown criterion %s", criterion);
      endswitch
    case "maxclust"
      ## the MaxClust case can be regarded as a Cutoff case with distance
      ## criterion, where the threshold is set to the height of the highest node
      ## that allows us to have N different clusters
      vThresholds = Z(:, 3);

      ## let's build a vector with the apt threshold values
      for k = 1:length (N);
        if (N(k) > n)
          C(end+1) = 0;
        elseif (N(k) < 2)
          C(end+1) = Z(end, 3) + 1;
        else
          C(end+1) = Z((end + 2 - N(k)), 3);
        endif
      endfor
  endswitch

  for c_index = 1:length (C)
    cluster_cutoff_recursive (rows (Z), nClusters, c_index);
    T = [T; vT];
  endfor

  T = T';           # return value

  ## recursive function
  ## for each link check if the cutoff criteria (a threshold value) are met,
  ## then call recursively this function for every node below that;
  ## when we find a leaf, we add the index of its cluster to the return value
  function cluster_cutoff_recursive (index, cluster_number, c_index)

    vClusterNumber = [cluster_number, cluster_number];

    ## check the threshold value
    if (vThresholds(index) >= C(c_index))
      ## create a new cluster
      nClusters++;
      vClusterNumber(2) = nClusters;
    endif;

    ## go on, down the tree
    for j = 1:2
      if (Z(index,j) > n)
        new_index = Z(index,j) - n;
        cluster_cutoff_recursive (new_index, vClusterNumber(j), c_index);
      else
        ## if the next node is a leaf, add the index of its cluster to the
        ## result at the correct position, i.e. the leaf number;
        ## if leaf 14 belongs to cluster 3:
        ## vT(14) = 3;
        vT(Z(index,j)) = vClusterNumber(j);
      endif
    endfor
  endfunction

endfunction


## Test input validation
%!error cluster ()
%!error <Z must be .* generated by the linkage .*> cluster ([1 1], "Cutoff", 1)
%!error <unknown option .*> cluster ([1 2 1], "Bogus", 1)
%!error <C must be a positive scalar .*> cluster ([1 2 1], "Cutoff", -1)
%!error <unknown property .*> cluster ([1 2 1], "Cutoff", 1, "Bogus", 1)

## Test output
%!test
% X = [(randn (10, 2) * 0.25) + 1; (randn (10, 2) * 0.25) - 1];
% Z = linkage(X, "ward");
% T = [ones (10, 1); 2 * ones (10, 1)];
% assert (cluster (Z, "MaxClust", 2), T);

