/* wordsplit - a word splitter
   Copyright (C) 2009 Sergey Poznyakoff

   This program is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by the
   Free Software Foundation; either version 3 of the License, or (at your
   option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License along
   with this program. If not, see <http://www.gnu.org/licenses/>. */

#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

#include <ctype.h>
#include <c-ctype.h>
#include <errno.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <wordsplit.h>

#include <error.h>
#include <gettext.h>
#define _(msgid) gettext (msgid)
#include <xalloc.h>

/* True if C is a space, a horizontal tab or a newline.  */
#define isws(c) ((c)==' '||(c)=='\t'||(c)=='\n')
/* True if C occurs in the NUL-terminated string DELIM.  Since strchr
   treats the terminating NUL as part of the string, a NUL character is
   reported as a delimiter too.  */
#define isdelim(c,delim) (strchr(delim,(c))!=NULL)

/* Internal bits kept in ws_flags: scan_word sets _ARGCV_WORD_SED_EXPR
   when the word it has just scanned is a sed expression, and clears
   every bit of _ARGCV_WORD_MASK before scanning a new word.  */
#define _ARGCV_WORD_SED_EXPR 0x10000
#define _ARGCV_WORD_MASK     0xf0000

/* Number of word slots allocated initially (ALLOC_INIT) and the headroom
   used when the vector is grown (ALLOC_INCR); see alloc_space.  */
#define ALLOC_INIT 128
#define ALLOC_INCR 128

/* Prepare WSP for splitting the LEN bytes at INPUT according to FLAGS.

   FLAGS are stored in ws_flags first.  Both WRDSF_NOVAR and WRDSF_NOCMD
   must be set; otherwise errno is set to EINVAL, a diagnostic is printed
   if WRDSF_SHOWERR is set, and 1 is returned.

   Fields whose controlling flag is not set receive defaults: ws_offs
   becomes 0 without WRDSF_DOOFFS, ws_delim becomes " " without
   WRDSF_DELIM, ws_comment becomes NULL without WRDSF_COMMENT.  Without
   WRDSF_REUSE the word vector fields are reset; with it, ws_wordn is
   recomputed from ws_wordc, and ws_wordc is zeroed unless WRDSF_APPEND
   is set.

   Return 0 on success.  */
static int
wordsplit_init (struct wordsplit *wsp, const char *input, size_t len,
		int flags)
{
  const int required = WRDSF_NOVAR | WRDSF_NOCMD;

  wsp->ws_flags = flags;

  if ((wsp->ws_flags & required) != required)
    {
      if (wsp->ws_flags & WRDSF_SHOWERR)
	error (0, 0,
	       _("variable expansion and command substitution "
		 "are not yet supported"));
      errno = EINVAL;
      return 1;
    }

  wsp->ws_input = input;
  wsp->ws_len = len;
  wsp->ws_endp = 0;

  /* Supply defaults for the fields the caller did not provide.  */
  if (!(wsp->ws_flags & WRDSF_DELIM))
    wsp->ws_delim = " ";
  if (!(wsp->ws_flags & WRDSF_COMMENT))
    wsp->ws_comment = NULL;

  if (!(wsp->ws_flags & WRDSF_REUSE))
    {
      wsp->ws_wordv = NULL;
      wsp->ws_wordc = 0;
      wsp->ws_wordn = 0;
    }
  else
    {
      wsp->ws_wordn = wsp->ws_wordc + 1;
      if (!(wsp->ws_flags & WRDSF_APPEND))
	wsp->ws_wordc = 0;
    }

  /* The reserved leading slots count towards the vector size.  */
  if (wsp->ws_flags & WRDSF_DOOFFS)
    wsp->ws_wordn += wsp->ws_offs;
  else
    wsp->ws_offs = 0;

  return 0;
}

/* Make sure the word vector of WSP has room for one more entry after the
   ws_wordc words already stored (plus the ws_offs reserved slots when
   WRDSF_DOOFFS is set).

   A missing vector is created zero-filled with ALLOC_INIT slots; a full
   one is grown so that ALLOC_INCR slots follow the current words.

   Return 0 on success.  On allocation failure, call xalloc_die if
   WRDSF_ENOMEMABRT is set, else print a diagnostic if WRDSF_SHOWERR is
   set; then set errno to ENOMEM and return 1.  The old vector is left
   untouched in that case.  */
static int
alloc_space (struct wordsplit *wsp)
{
  size_t base = 0;
  size_t capacity;
  char **vec;

  if (wsp->ws_flags & WRDSF_DOOFFS)
    base = wsp->ws_offs;

  if (wsp->ws_wordv == NULL)
    {
      capacity = base + ALLOC_INIT;
      vec = calloc (capacity, sizeof (vec[0]));
    }
  else if (wsp->ws_wordn < base + wsp->ws_wordc + 1)
    {
      capacity = base + wsp->ws_wordc + ALLOC_INCR;
      vec = realloc (wsp->ws_wordv, capacity * sizeof (vec[0]));
    }
  else
    return 0;			/* Enough room already.  */

  if (!vec)
    {
      if (wsp->ws_flags & WRDSF_ENOMEMABRT)
	xalloc_die ();
      else if (wsp->ws_flags & WRDSF_SHOWERR)
	error (0, 0, _("memory exhausted"));
      errno = ENOMEM;
      return 1;
    }

  wsp->ws_wordn = capacity;
  wsp->ws_wordv = vec;
  return 0;
}

/* Skip a sequence of sed-style substitutions ("s<d>...<d>...<d>flags",
   optionally chained with ';') in COMMAND, starting at offset I.  LEN is
   the number of valid bytes in COMMAND; the caller must ensure I < LEN.

   Within one expression STATE counts the delimiters seen so far: it is 1
   after the opening delimiter and 3 after the closing one, at which
   point trailing alphanumeric flags are consumed.  A backslash escapes
   the following byte.

   Return the offset of the first byte not belonging to the expression.
   The result never exceeds LEN.  */
static int
skip_sed_expr(const char *command, size_t i, size_t len)
{
  int state = 0;

  do
    {
      int delim;

      if (command[i] == ';')
	i++;
      /* Test the bound first: after skipping ';' I may equal LEN, and
	 command[LEN] need not be readable.  */
      if (!(i + 3 < len && command[i] == 's' && c_ispunct(command[i+1])))
	break;

      delim = command[++i];
      state = 1;
      for (i++; i < len; i++)
	{
	  if (state == 3)
	    {
	      /* Past the closing delimiter: consume flags only.  */
	      if (command[i] == delim || !c_isalnum(command[i]))
		break;
	    }
	  else if (command[i] == '\\')
	    i++;
	  else if (command[i] == delim)
	    state++;
	}
    }
  while (state == 3 && i < len && command[i] == ';');

  /* A backslash in the last byte pushes I to LEN + 1; do not report an
     end offset beyond the input.  */
  if (i > len)
    i = len;
  return i;
}

/* Compute the offset at which the next word scan should begin, given
   that the previous one stopped at wsp->ws_endp.

   With WRDSF_SQUEEZE_DELIMS, a run of delimiter characters following
   ws_endp is passed over, leaving the position on the last character of
   that run (the character at ws_endp itself is not examined).  Unless
   WRDSF_RETURN_DELIMS is set, the position is then moved one character
   further, so that the delimiter is not returned as a word.  */
static size_t
skip_delim (struct wordsplit *wsp)
{
  size_t pos = wsp->ws_endp;

  if (wsp->ws_flags & WRDSF_SQUEEZE_DELIMS)
    {
      size_t next = pos + 1;

      while (next < wsp->ws_len
	     && isdelim (wsp->ws_input[next], wsp->ws_delim))
	next++;
      pos = next - 1;
    }

  return (wsp->ws_flags & WRDSF_RETURN_DELIMS) ? pos : pos + 1;
}

/* Return codes of scan_word for which wordsplit_len keeps scanning.  */
/* A word was delimited and is to be stored.  */
#define _WRDS_WORD 1
/* A comment started where a word was expected: nothing to store.  */
#define _WRDS_CONT 2

/* Find the next word in the input of WSP, scanning from offset *PSTART.

   Return values:
     _WRDS_WORD  - a word was found.  *PSTART and *PEND delimit it (the
		   end is exclusive) and wsp->ws_endp is the offset at
		   which scanning stopped.  If a comment character ended
		   the word, ws_endp is the end of the comment instead.
     _WRDS_CONT  - a comment began right at the start of the word;
		   ws_endp is set to the next newline or to the end of
		   input, and there is nothing to store.
     WRDSE_EOF   - no input is left.
     WRDSE_QUOTE - a quoted string is not closed; ws_endp is the offset
		   of the opening quote.

   The _ARGCV_WORD_MASK bits of ws_flags are cleared on each call and
   _ARGCV_WORD_SED_EXPR is set if the word is a sed expression.  */
static int
scan_word (struct wordsplit *wsp, size_t *pstart, size_t *pend)
{
  size_t start = *pstart;
  size_t len = wsp->ws_len;
  const char *command = wsp->ws_input;
  const char *delim = wsp->ws_delim;
  const char *comment = wsp->ws_comment;
  
  size_t i = start;

  if (i >= len)
    return WRDSE_EOF;

  if (wsp->ws_flags & WRDSF_WS)
    {
      /* Skip initial whitespace */
      while (isws (command[i]))
	if (++i == len)
	  return WRDSE_EOF;
    }

  start = i;
      
  wsp->ws_flags &= ~_ARGCV_WORD_MASK;
      
  if (wsp->ws_flags & WRDSF_SED_EXPR
      && command[i] == 's' && i + 3 < len && c_ispunct (command[i+1]))
    {
      wsp->ws_flags |= _ARGCV_WORD_SED_EXPR;
      i = skip_sed_expr (command, i, len);
    }
  else if (!isdelim (command[i], delim))
    {
      while (i < len)
	{
	  if (comment && strchr (comment, command[i]) != NULL)
	    {
	      /* The comment extends up to the newline or end of input.  */
	      size_t j;
	      for (j = i + 1; j < len && command[j] != '\n'; j++)
		;
	      *pstart = start;
	      *pend = i;
	      wsp->ws_endp = j;
	      return i > start ? _WRDS_WORD : _WRDS_CONT; 
	    }
	      
	  if (wsp->ws_flags & WRDSF_QUOTE)
	    {
	      if (command[i] == '\\')
		{
		  /* A backslash protects the byte that follows it.  */
		  if (++i == len)
		    break;
		  i++;
		  continue;
		}
	      
	      if (command[i] == '\'' || command[i] == '"')
		{
		  /* Locate the matching quote, honouring backslashes.  */
		  size_t j;
		  for (j = i + 1; j < len && command[j] != command[i]; j++)
		    if (command[j] == '\\')
		      j++;
		  if (j < len && command[j] == command[i])
		    {
		      i = j + 1;
		      /* The closing quote may be the very last input
			 byte; do not look at command[len].  */
		      if (i >= len)
			break;
		    }
		  else
		    {
		      wsp->ws_endp = i;
		      if (wsp->ws_flags & WRDSF_SHOWERR)
			error (0, 0,
			       _("missing closing %c (start near #%lu)"),
			       command[i], (unsigned long) i);
		      return WRDSE_QUOTE;
		    }
		}
	    }

	  if (((wsp->ws_flags & WRDSF_WS) && isws (command[i]))
	      || isdelim (command[i], delim))
	    break;
	  else
	    i++;
	}
    }
  else if (wsp->ws_flags & WRDSF_RETURN_DELIMS)
    i++;			/* The delimiter itself forms the word.  */
  
  *pstart = start;
  *pend = i;
  wsp->ws_endp = i;

  return _WRDS_WORD;
}

/* Escape translation table.  It is a sequence of two-character pairs:
   the letter that follows a backslash, then the character the escape
   stands for.  */
static char quote_transtab[] = "\\\\a\ab\bf\fn\nr\rt\tv\v";

/* Return the character denoted by the escape sequence backslash-C, or C
   itself if the table has no such escape.  */
int
wordsplit_unquote_char (int c)
{
  const char *entry = quote_transtab;

  while (entry[0] != '\0')
    {
      if (entry[0] == c)
	return entry[1];
      entry += 2;
    }
  return c;
}

/* Return the letter that, preceded by a backslash, represents the
   character C, or -1 if C has no such representation in the table.  */
int
wordsplit_quote_char (int c)
{
  const char *entry;

  for (entry = quote_transtab; entry[0] != '\0'; entry += 2)
    {
      if (entry[1] == c)
	return entry[0];
    }
  return -1;
}
  
/* Numeric value of the digit C (decimal or hexadecimal, either case),
   or 255 if C is not a hexadecimal digit.  C is evaluated several
   times.  */
#define to_num(c) \
  (isdigit (c) ? (c) - '0' : (isxdigit (c) ? toupper (c) - 'A' + 10 : 255))

/* Convert at most CNT leading digits of SRC, taken in the given BASE,
   to a number.  Conversion stops at the first byte that is not a valid
   digit in BASE.  Store the result in *PVAL (0 if nothing was converted)
   and return the number of bytes consumed.  */
static int
xtonum (int *pval, const char *src, int base, int cnt)
{
  const unsigned char *digits = (const unsigned char *) src;
  int used = 0;
  int acc = 0;

  while (used < cnt)
    {
      int d = digits[used];

      /* Bytes above 127 are never digits; keep them away from the
	 <ctype.h> classifiers.  */
      if (d > 127)
	break;
      d = to_num (d);
      if (d >= base)
	break;
      acc = acc * base + d;
      used++;
    }

  *pval = acc;
  return used;
}

/* Return the number of bytes wordsplit_quote_copy produces for the
   NUL-terminated string STR with the same QUOTE_HEX setting.

   A space or any other printable character except double quote, tab and
   backslash counts as 1; a double quote counts as 2.  Every other byte
   counts as 3 when QUOTE_HEX is non-zero; otherwise as 2 if
   wordsplit_quote_char knows an escape letter for it and as 4 if not.

   *QUOTE is set to 1 if STR contains a space or a double quote, and to 0
   otherwise.  */
size_t
wordsplit_quoted_length (const char *str, int quote_hex, int *quote)
{
  size_t len = 0;

  *quote = 0;
  for (; *str; str++)
    {
      if (*str == ' ')
	{
	  len++;
	  *quote = 1;
	}
      else if (*str == '"')
	{
	  len += 2;
	  *quote = 1;
	}
      /* <ctype.h> classifiers require a value representable as unsigned
	 char; plain char may be negative.  */
      else if (*str != '\t' && *str != '\\'
	       && isprint ((unsigned char) *str))
	len++;
      else if (quote_hex)
	len += 3;
      else if (wordsplit_quote_char (*str) != -1)
	len += 2;
      else
	len += 4;
    }
  return len;
}

/* Copy the N bytes at SRC to DST, removing quotes and translating
   backslash escapes, and terminate DST with a NUL.  The output never
   takes more than N bytes plus the terminator.

   Only the first N bytes of SRC are examined:
   - a single or double quote is dropped if a matching quote follows it
     within those N bytes, and copied literally otherwise;
   - "\xH" or "\xHH" yields the byte with that hexadecimal code, and
     "\O", "\OO" or "\OOO" the byte with that octal code; an 'x' or a
     digit that cannot be converted is copied together with its
     backslash;
   - any other "\C" is translated by wordsplit_unquote_char;
   - a backslash that is the last of the N bytes is dropped.

   NOTE(review): expect_delim is never cleared once the closing quote
   has been seen, so every later quote of the same kind is dropped and
   quotes of the other kind are copied literally - confirm this is
   intended.  */
void
wordsplit_unquote_copy (char *dst, const char *src, size_t n)
{
  size_t i = 0;
  int c;
  int expect_delim = 0;

  while (i < n)
    {
      switch (src[i])
	{
	case '\'':
	case '"':
	  if (!expect_delim)
	    {
	      size_t j;

	      /* Search for the matching quote, skipping escaped bytes,
		 without leaving the N bytes we were given.  */
	      for (j = i + 1; j < n && src[j] && src[j] != src[i]; j++)
		if (src[j] == '\\')
		  j++;
	      if (j < n && src[j])
		expect_delim = src[i++];
	      else
		*dst++ = src[i++];
	    }
	  else if (expect_delim == src[i])
	    ++i;
	  else
	    *dst++ = src[i++];
	  break;

	case '\\':
	  ++i;
	  if (i >= n)
	    break;		/* Nothing follows the backslash.  */

	  if (src[i] == 'x' || src[i] == 'X')
	    {
	      size_t avail = n - i - 1;	/* Bytes after the 'x'.  */
	      int off = 0;

	      if (avail > 0)
		off = xtonum (&c, src + i + 1, 16, avail < 2 ? 1 : 2);
	      if (off == 0)
		{
		  *dst++ = '\\';
		  *dst++ = src[i++];
		}
	      else
		{
		  *dst++ = c;
		  i += off + 1;
		}
	    }
	  else if ((unsigned char)src[i] < 128 && isdigit (src[i]))
	    {
	      size_t avail = n - i;	/* Bytes from the digit on.  */
	      int off = xtonum (&c, src + i, 8, avail < 3 ? (int) avail : 3);

	      if (off == 0)
		{
		  *dst++ = '\\';
		  *dst++ = src[i++];
		}
	      else
		{
		  *dst++ = c;
		  i += off;
		}
	    }
	  else
	    *dst++ = wordsplit_unquote_char (src[i++]);
	  break;

	default:
	  *dst++ = src[i++];
	}
    }
  *dst = 0;
}

/* Copy the NUL-terminated string SRC to DST in quoted form.

   A double quote is preceded by a backslash.  Printable characters other
   than tab and backslash are copied unchanged.  Any other byte is
   written as "%XX" (two upper-case hexadecimal digits) when QUOTE_HEX is
   non-zero; otherwise as a backslash followed by the letter returned by
   wordsplit_quote_char or, failing that, by three octal digits.

   No terminating NUL is written to DST.  wordsplit_quoted_length returns
   the number of bytes stored for a given SRC and QUOTE_HEX.  */
void
wordsplit_quote_copy (char *dst, const char *src, int quote_hex)
{
  for (; *src; src++)
    {
      if (*src == '"')
	{
	  *dst++ = '\\';
	  *dst++ = *src;
	}
      /* <ctype.h> classifiers require a value representable as unsigned
	 char; plain char may be negative.  */
      else if (*src != '\t' && *src != '\\'
	       && isprint ((unsigned char) *src))
	*dst++ = *src;
      else
	{
	  char tmp[4];

	  if (quote_hex)
	    {
	      snprintf (tmp, sizeof tmp, "%%%02X", *(unsigned char*)src);
	      memcpy (dst, tmp, 3);
	      dst += 3;
	    }
	  else
	    {
	      int c = wordsplit_quote_char (*src);
	      *dst++ = '\\';
	      if (c != -1)
		*dst++ = c;
	      else
		{
		  snprintf (tmp, sizeof tmp, "%03o", *(unsigned char*)src);
		  memcpy (dst, tmp, 3);
		  dst += 3;
		}
	    }
	}
    }
}

/* Split the LEN bytes at COMMAND into words according to FLAGS and
   store them in WSP.

   Each word is copied into its own malloc'ed, NUL-terminated string and
   appended to wsp->ws_wordv after the ws_offs reserved slots; the vector
   is terminated by a NULL pointer.  When WRDSF_QUOTE is set and the word
   is not a sed expression, a word enclosed in a pair of identical quote
   characters has them stripped; escapes are then translated by
   wordsplit_unquote_copy unless the enclosing quotes were single ones.

   Return the non-zero result of wordsplit_init if initialization fails,
   WRDSE_NOSPACE if memory cannot be allocated, and otherwise the code
   with which scan_word ended the scan (the loop runs while that code is
   positive).  */
int
wordsplit_len (const char *command, size_t len, struct wordsplit *wsp,
	       int flags)
{
  int rc;
  size_t start = 0, end = 0;
  
  rc = wordsplit_init (wsp, command, len, flags);
  if (rc)
    return rc;

  for (; (rc = scan_word (wsp, &start, &end)) > 0; start = skip_delim (wsp))
    {
      int unquote = 1;
      size_t n;
      char *p;

      if (rc == _WRDS_CONT)
	continue;
	
      if (alloc_space (wsp))
	return WRDSE_NOSPACE;

      n = end - start;
      
      if (wsp->ws_flags & WRDSF_QUOTE &&
	  !(wsp->ws_flags & _ARGCV_WORD_SED_EXPR))
	{
	  /* Stripping the quotes takes two bytes.  A one-byte word made
	     of a lone quote character (which scan_word returns when the
	     quote is a delimiter and WRDSF_RETURN_DELIMS is set) must be
	     left alone, or N would wrap around.  */
	  if (n >= 2
	      && (command[start] == '"' || command[start] == '\'')
	      && command[end-1] == command[start])
	    {
	      unquote = command[start] == '"';
	      start++;
	      n -= 2;
	    }
	}
      else 
	unquote = 0;
      
      p = malloc (n + 1);
      if (!p)
	{
	  if (wsp->ws_flags & WRDSF_ENOMEMABRT)
	    xalloc_die ();
	  if (wsp->ws_flags & WRDSF_SHOWERR)
	    error (0, 0, _("memory exhausted"));
	  if (!(wsp->ws_flags & WRDSF_REUSE))
	    wordsplit_free (wsp);
	  errno = ENOMEM;
	  return WRDSE_NOSPACE;
	}
      
      if (unquote)
	wordsplit_unquote_copy (p, &command[start], n);
      else
	{
	  memcpy (p, &command[start], n);
	  p[n] = 0;
	}
      wsp->ws_wordv[wsp->ws_offs + wsp->ws_wordc] = p;
      wsp->ws_wordc++;
    }

  /* Make room for, and store, the terminating NULL pointer.  */
  if (alloc_space (wsp))
    return WRDSE_NOSPACE;
  wsp->ws_wordv[wsp->ws_offs + wsp->ws_wordc] = NULL;
  /* FIXME: if (rc) free(ws) */
  return rc;
}

/* Split the NUL-terminated string COMMAND into words.  This is
   wordsplit_len applied to the whole string; see that function for the
   meaning of WS, FLAGS and the return value.  */
int
wordsplit (const char *command, struct wordsplit *ws, int flags)
{
  size_t length = strlen (command);

  return wordsplit_len (command, length, ws, flags);
}
      
/* Release the word vector of WS and reset ws_wordv to NULL.

   Only the vector itself is freed, not the strings it points to.
   NOTE(review): wordsplit_len allocates each word with malloc - confirm
   that callers are expected to free the words themselves.  */
void
wordsplit_free (struct wordsplit *ws)
{
  void *vec = ws->ws_wordv;

  ws->ws_wordv = NULL;
  free (vec);
}



