I wrote a unit test to figure out what was happening with the RuleBasedCollator, and I've fixed it. These are the issues I came up against:
- The class was generating a ParseException, but Cocoon or Saxon was silently swallowing the exception and defaulting to a normal alpha sort. Until I ran the unit test, I didn't know the exception was being triggered.
- The exception was caused by the fact that I had included both decomposed and composed versions of (for instance) a with a dot below. It turns out that the RuleBasedCollator class actually does normalization itself before doing comparisons, so there was no need for this, and in fact it was throwing the ParseException because it looked as though I was asserting that a character was equal to itself (or unequal -- the exception wasn't very clear on this).
- There were a couple of typos in Unicode codepoints for raised W, which were causing the precise sort I was using for testing to fail anyway.
- There was a stray combining-dot-below, virtually invisible and hard to select, in the rule definition, making it unparseable.
The collator now seems to be working fine. The code is below:
package ca.uvic.hcmc.moses;
/**
*
* @author mholmes
*/
import java.text.ParseException;
import java.text.RuleBasedCollator;
/**
 * A {@link RuleBasedCollator} whose ordering is fixed by the rule string
 * assembled in {@link #MOSES_RULES}.
 *
 * <p>The rule string is a chain of {@code <} relations. Each link is either a
 * plain ASCII letter written inline or one of the named constants below; a
 * constant may hold several comma-separated alternatives (for example a base
 * vowel together with its acute- and grave-accented forms).
 */
public class MosesCollation extends RuleBasedCollator {

    /**
     * Builds the collator from the built-in rule string.
     *
     * @throws ParseException if {@code RuleBasedCollator} rejects the rules
     */
    public MosesCollation() throws ParseException {
        super(MOSES_RULES);
    }

    /*
     * Why some entries list only the precomposed character:
     * RuleBasedCollator automatically performs Unicode normalization before it
     * does its comparisons, and listing "parallel" versions of the same
     * character (precomposed and base + combining dot below, U+0323) was
     * triggering a ParseException. The decomposed alternates were therefore
     * dropped from A_DOT, H_DOT, H_DOT_W, L_DOT, L_DOT_GLOT and S_DOT.
     */

    // Glottal stop.
    private static final String GLOTTAL = "\u0294";

    // a and its dotted-below counterpart, each with acute/grave variants.
    private static final String A = "a,a\u0301,a\u0300,\u00e1,\u00e0";
    private static final String A_DOT = "\u1ea1,\u1ea1\u0301,\u1ea1\u0300,\u00e1\u0323,\u00e0\u0323";

    // c series.
    private static final String C_DOT = "c\u0323";
    private static final String C_APOS = "c\u02bc";

    // Schwa and dotted-below schwa, each with acute/grave variants.
    private static final String SCHWA = "\u0259,\u0259\u0301,\u0259\u0300";
    private static final String SCHWA_DOT = "\u0259\u0323,\u0259\u0323\u0301,\u0259\u0323\u0300";

    // h series.
    private static final String H_DOT = "\u1e25";
    private static final String H_DOT_W = "\u1e25\u02b7";

    // i and its dotted-below counterpart, each with acute/grave variants.
    private static final String I = "i,i\u0301,i\u0300,\u00ed,\u00ec";
    private static final String I_DOT = "\u1ecb,\u1ecb\u0301,\u1ecb\u0300,i\u0323\u0301,i\u0323\u0300,\u00ed\u0323,\u00ec\u0323";

    // k series.
    private static final String K_APOS = "k\u02bc";
    private static final String K_W = "k\u02b7";
    private static final String K_APOS_W = "k\u02bc\u02b7";

    // l series, belted l and barred lambda.
    private static final String L_DOT = "\u1e37";
    private static final String L_GLOT = "l\u02c0";
    private static final String L_DOT_GLOT = "\u1e37\u02c0";
    private static final String L_BELT = "\u026c";
    private static final String BAR_LAM_APOS = "\u019b\u02bc";

    // m, n, p, q, r, s, t series.
    private static final String M_GLOT = "m\u02c0";
    private static final String N_GLOT = "n\u02c0";
    private static final String P_APOS = "p\u02bc";
    private static final String Q_APOS = "q\u02bc";
    private static final String Q_W = "q\u02b7";
    private static final String Q_APOS_W = "q\u02bc\u02b7";
    private static final String R_GLOT = "r\u02c0";
    private static final String S_DOT = "\u1e63";
    private static final String T_APOS = "t\u02bc";

    // u and its dotted-below counterpart, each with acute/grave variants.
    private static final String U = "u,u\u0301,u\u0300,\u00fa,\u00f9";
    private static final String U_DOT = "\u1ee5,\u1ee5\u0301,\u1ee5\u0300,u\u0323\u0301,u\u0323\u0300,\u00fa\u0323,\u00f9\u0323";

    // w, x, y series.
    private static final String W_GLOT = "w\u02c0";
    private static final String X_W = "x\u02b7";
    private static final String X_DOT = "x\u0323";
    private static final String X_DOT_W = "x\u0323\u02b7";
    private static final String Y_GLOT = "y\u02c0";

    // Pharyngeal series.
    private static final String PHAR = "\u0295";
    private static final String PHAR_GLOT = "\u0295\u02c0";
    private static final String PHAR_W = "\u0295\u02b7";
    private static final String PHAR_GLOT_W = "\u0295\u02c0\u02b7";

    /**
     * The complete rule string passed to the superclass constructor. The
     * spacing between tokens is kept exactly as originally written so that
     * {@code getRules()} continues to return the same text.
     */
    private static final String MOSES_RULES =
            "< " + GLOTTAL + " < " + A + " < " + A_DOT + " < c "
            + " < " + C_DOT + " < " + C_APOS + " < " + SCHWA + " < " + SCHWA_DOT
            + " < h < " + H_DOT + " < " + H_DOT_W + " < i "
            + " < " + I_DOT + " < k < " + K_APOS + " < " + K_W
            + " < " + K_APOS_W + " < l < " + L_DOT + " < " + L_GLOT
            + " < " + L_DOT_GLOT + " < " + L_BELT + " < " + BAR_LAM_APOS + " < m "
            + " < " + M_GLOT + " < n < " + N_GLOT + " < p "
            + " < " + P_APOS + " < q < " + Q_APOS + " < " + Q_W
            + " < " + Q_APOS_W + " < r < " + R_GLOT + " < s "
            + " < " + S_DOT + " < t < " + T_APOS + " < " + U
            + " < " + U_DOT + " < w < " + W_GLOT + " < x "
            + " < " + X_W + " < " + X_DOT + " < " + X_DOT_W + " < y "
            + " < " + Y_GLOT + " < " + PHAR + " < " + PHAR_GLOT + " < " + PHAR_W
            + " < " + PHAR_GLOT_W;
}