#!/usr/bin/gawk -f

#
# Copyright(C) 2007-2012 National Institute of Information and Communications Technology
#

# Save this file in EUC-JP encoding.
# Merges CRF++ output back into a *.tsv file.

BEGIN {
  # Records are split on tabs on input and re-joined with tabs on output.
  OFS = FS = "\t";

  # At least one operand (the *.tsv file) is required.
  if (ARGC <= 1) {
    # NOTE(review): SCRIPT is not assigned anywhere in this file; presumably
    # the caller provides it (e.g. via -v) -- confirm.
    printf "usage: %s <*.tsv file> [<*.otag>]\n", SCRIPT > "/dev/stderr";
    # END rules still run after an exit in BEGIN, so the status is kept in
    # EXIT for the END rule to notice.
    EXIT = 1;
    exit 1;
  }

  # Keep the *.tsv name for the END rule and blank its ARGV slot so the main
  # input loop skips it and reads only the remaining operand (or stdin).
  tsvfile = ARGV[1];
  ARGV[1] = "";

  # State shared with the per-record rule:
  #   buf    - text of the B/I span currently being assembled
  #   cnt    - number of spans stored in vec[] for the current sentence
  #   sen    - concatenation of column 1 over the current sentence
  #   extnum - number of sentences recorded in extent[]/extsen[]
  extnum = cnt = 0;
  sen = buf = "";
}

# Per-record rule for the main input: column 1 is the token text and column 7
# the B/I tag.  Consecutive B/I tokens are glued into spans; a blank record
# ends a sentence, at which point the first span (or "" when there is none)
# and the sentence text are stored in extent[] / extsen[].
{
  # Any record that does not continue the open span (i.e. whose tag is not
  # "I", which includes the blank separator) closes it.
  if (!(buf == "" || $7 == "I")) {
    vec[cnt] = buf;
    cnt++;
    buf = "";
  }

  if ($0 != "") {
    # Token record: "B" opens a fresh span, "I" extends the current one.
    if ($7 == "B") {
      buf = $1;
    } else if ($7 == "I") {
      buf = buf $1;
    }
    # The sentence text is the plain concatenation of all token texts.
    sen = sen $1;
  } else {
    # Sentence boundary: vec[] is never cleared, so vec[0] is reset
    # explicitly when this sentence produced no span.
    if (cnt < 1) vec[0] = "";
    extsen[extnum] = sen;
    extent[extnum] = vec[0];
    extnum++;

    sen = "";
    cnt = 0;
  }
}

# Rewrites the *.tsv file named by tsvfile to standard output, replacing the
# non-"[]" entries of column 7 with the spans collected from the main input.
#
# Column layout as used below: $5 is compared against the recorded sentence
# text, $7 is a list of entries separated by a literal backslash-n, and a
# record whose $10 is empty is copied through unchanged.
END {
  # exit inside BEGIN still runs END; honour a usage error recorded there.
  if (EXIT != "") exit EXIT;

  # The per-record rule only stores a sentence when it sees a blank line.
  # If the input ended without one, store the pending sentence here so the
  # last sentence is not silently lost.
  if (sen != "" || buf != "") {
    if (buf != "") {
      vec[cnt++] = buf;
      buf = "";
    }
    extent[extnum] = (cnt > 0) ? vec[0] : "";
    extsen[extnum] = sen;
    extnum++;
    cnt = 0;
    sen = "";
  }

  # y indexes the next unused entry of extent[] / extsen[].
  y = 0;
  for (;;) {
    r = (getline < tsvfile);
    if (r == 0) break;
    if (r < 0) {
      # error() is not defined in this file; presumably it terminates the
      # program -- confirm.  The exit guards against an endless loop if it
      # returns.
      error("file I/O error");
      exit 1;
    }

    # Count the lines read from the TSV file (used by the diagnostic that is
    # commented out below).
    nlines++;

    if ($10 == "") {
      print;
      continue;
    }

    n = split($7, item, /\\n/);
    res = "";
    for (i = 1; i <= n; i++) {
      if (item[i] != "[]") {
        if ($5 != extsen[y]) {
          # Disabled here because it is reported on the tsv.pm side.
          #        showError("sentence mismatch");
          #        printf " = Line:%d\n", nlines > "/dev/stderr";
          # The entry is left out of the rebuilt column and y is not advanced.
          continue;
        }
        # An empty span is written using the same "[]" placeholder.
        if (extent[y] == "") {
          extent[y] = "[]";
        }
        item[i] = extent[y];
        y++;
      }
      res = res "\\n" item[i];
    }
    # Every entry was prefixed with the two-character separator; drop the
    # leading one before storing the rebuilt column.
    $7 = substr(res, 3);
    print;
  }
  close(tsvfile);

  # Every recorded sentence must have been consumed by exactly one entry.
  if (y != extnum) error("#entities mismatch");
}
