import java.io.IOException;
<<<<<<< HEAD   FIXME(review): unresolved git merge conflict — two conflicting versions of HierarchicalRuleExtractor follow (divider at the ======= line); resolve before this file will compile
import edu.jhu.thrax.ThraxConfig;
import edu.jhu.thrax.datatypes.*;
import edu.jhu.thrax.util.ConfFileParser;
import edu.jhu.thrax.util.Vocabulary;
import edu.jhu.thrax.util.exceptions.*;
import edu.jhu.thrax.util.io.InputUtilities;

import java.util.Collection;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Mapper;
/**
* This class extracts Hiero-style SCFG rules. The inputs that are needed
* are "source" "target" and "alignment", which are the source and target
* sides of a parallel corpus, and an alignment between each of the sentences.
*/
public class HierarchicalRuleExtractor implements RuleExtractor {
// Grammar-extraction parameters. Each is overridden in the constructor by the
// "thrax.*" configuration key named in its comment; the initializer here is
// the default used when the key is absent.

// Maximum length of an initial phrase pair ("thrax.initial-phrase-length").
public int INIT_LENGTH_LIMIT = 10;
// Length limit on the source side of non-lexical rules ("thrax.nonlex-source-length").
public int NONLEX_SOURCE_LENGTH_LIMIT = 5;
// Word-count limit on the source side of non-lexical rules ("thrax.nonlex-source-words").
public int NONLEX_SOURCE_WORD_LIMIT = 5;
// Length limit on the target side of non-lexical rules ("thrax.nonlex-target-length").
public int NONLEX_TARGET_LENGTH_LIMIT = 5;
// Word-count limit on the target side of non-lexical rules ("thrax.nonlex-target-words").
public int NONLEX_TARGET_WORD_LIMIT = 5;
// Maximum number of nonterminals per rule ("thrax.arity"); also passed to Rule in extract().
public int NT_LIMIT = 2;
// Minimum lexicality requirement ("thrax.lexicality").
public int LEXICAL_MINIMUM = 1;
// Whether two nonterminals may be adjacent on the source side ("thrax.adjacent-nts").
public boolean ALLOW_ADJACENT_NTS = false;
// Whether loose phrase boundaries are allowed ("thrax.loose").
public boolean ALLOW_LOOSE_BOUNDS = false;
// Whether rules spanning the full sentence are kept ("thrax.allow-full-sentence-rules").
public boolean ALLOW_FULL_SENTENCE_RULES = true;
// Whether fully abstract (no-terminal) rules are allowed ("thrax.allow-abstract-rules").
public boolean ALLOW_ABSTRACT = false;
// Whether non-lexical rules may carry the default X label ("thrax.allow-nonlexical-x").
public boolean ALLOW_X_NONLEX = false;
// Maximum span a rule may cover ("thrax.rule-span-limit").
public int RULE_SPAN_LIMIT = 12;
// Target-side length limit for lexical rules ("thrax.lex-target-words").
public int LEX_TARGET_LENGTH_LIMIT = 12;
// Source-side length limit for lexical rules ("thrax.lex-source-words").
public int LEX_SOURCE_LENGTH_LIMIT = 12;
// True if the source input lines are parse trees ("thrax.source-is-parsed").
public boolean SOURCE_IS_PARSED = false;
// True if the target input lines are parse trees ("thrax.target-is-parsed";
// the legacy key "thrax.english-is-parsed" overrides it — see constructor).
public boolean TARGET_IS_PARSED = false;
// When true, the source and target sides are swapped in extract() ("thrax.reverse").
public boolean REVERSE = false;
// Assigns syntactic labels to spans; extract() hands it each raw input line.
private SpanLabeler labeler;
// Holds the single default nonterminal label id ("thrax.default-nt", default "X").
// NOTE(review): raw Collection type — presumably Collection<Integer>; confirm.
private Collection defaultLabel;
// Hadoop mapper context saved at construction (e.g. for counters/progress).
private Mapper.Context context;
/**
 * Creates an extractor configured from a Hadoop job configuration.
 * Every grammar parameter declared above is overridden by its
 * corresponding "thrax.*" configuration key when present; otherwise the
 * field's default value is kept.
 *
 * @param mapContext Hadoop mapper context, retained in {@code context}
 * @param conf       job configuration supplying the thrax.* settings
 * @param labeler    labeler used to assign syntactic labels to spans
 */
public HierarchicalRuleExtractor(Mapper.Context mapContext, Configuration conf, SpanLabeler labeler)
{
  this.context = mapContext;
  this.labeler = labeler;
  // Phrase and rule size limits.
  INIT_LENGTH_LIMIT = conf.getInt("thrax.initial-phrase-length", 10);
  NONLEX_SOURCE_LENGTH_LIMIT = conf.getInt("thrax.nonlex-source-length", 5);
  NONLEX_SOURCE_WORD_LIMIT = conf.getInt("thrax.nonlex-source-words", 5);
  NONLEX_TARGET_LENGTH_LIMIT = conf.getInt("thrax.nonlex-target-length", 5);
  NONLEX_TARGET_WORD_LIMIT = conf.getInt("thrax.nonlex-target-words", 5);
  NT_LIMIT = conf.getInt("thrax.arity", 2);
  LEXICAL_MINIMUM = conf.getInt("thrax.lexicality", 1);
  // Rule-shape switches.
  ALLOW_ADJACENT_NTS = conf.getBoolean("thrax.adjacent-nts", false);
  ALLOW_LOOSE_BOUNDS = conf.getBoolean("thrax.loose", false);
  ALLOW_FULL_SENTENCE_RULES = conf.getBoolean("thrax.allow-full-sentence-rules", true);
  ALLOW_ABSTRACT = conf.getBoolean("thrax.allow-abstract-rules", false);
  ALLOW_X_NONLEX = conf.getBoolean("thrax.allow-nonlexical-x", false);
  RULE_SPAN_LIMIT = conf.getInt("thrax.rule-span-limit", 12);
  LEX_TARGET_LENGTH_LIMIT = conf.getInt("thrax.lex-target-words", 12);
  LEX_SOURCE_LENGTH_LIMIT = conf.getInt("thrax.lex-source-words", 12);
  // Input format.
  SOURCE_IS_PARSED = conf.getBoolean("thrax.source-is-parsed", false);
  TARGET_IS_PARSED = conf.getBoolean("thrax.target-is-parsed", false);
  // Backwards-compatibility hack for matt: the legacy key
  // "thrax.english-is-parsed" overrides "thrax.target-is-parsed" when set.
  if (conf.get("thrax.english-is-parsed") != null)
    TARGET_IS_PARSED = conf.getBoolean("thrax.english-is-parsed", false);
  int defaultID = Vocabulary.getId(conf.get("thrax.default-nt", "X"));
  REVERSE = conf.getBoolean("thrax.reverse", false);
  // Typed instantiation instead of the raw `new HashSet()` to avoid an
  // unchecked add; the set holds the single default nonterminal label id.
  defaultLabel = new HashSet<Integer>();
  defaultLabel.add(defaultID);
}
/**
 * Extracts hierarchical rules from a single line of the parallel corpus.
 *
 * NOTE(review): this method is cut off mid-body by the unresolved merge
 * conflict divider immediately below; the remainder of its body (and the
 * enclosing class's closing brace) is missing from this side of the conflict.
 *
 * @param inp one input line: source sentence, target sentence, and word
 *            alignment, separated by ThraxConfig.DELIMITER_REGEX
 * @return the extracted rules (return statement not visible in this chunk)
 * @throws MalformedInputException if fields are missing, either sentence is
 *         empty, or the alignment is empty or inconsistent
 */
public List extract(String inp) throws MalformedInputException
{
String [] inputs = inp.split(ThraxConfig.DELIMITER_REGEX);
// Need at least the source, target, and alignment fields.
if (inputs.length < 3) {
throw new NotEnoughFieldsException();
}
String [] sourceWords = InputUtilities.getWords(inputs[0], SOURCE_IS_PARSED);
String [] targetWords = InputUtilities.getWords(inputs[1], TARGET_IS_PARSED);
if (sourceWords.length == 0 || targetWords.length == 0)
throw new EmptySentenceException();
// Map words to integer ids via the shared vocabulary.
int [] source = Vocabulary.getIds(sourceWords);
int [] target = Vocabulary.getIds(targetWords);
// When REVERSE is set, swap the roles of source and target.
if (REVERSE) {
int [] tmp = source;
source = target;
target = tmp;
}
Alignment alignment = new Alignment(inputs[2], REVERSE);
if (alignment.isEmpty())
throw new EmptyAlignmentException();
// Reject alignments with points outside either sentence.
if (!alignment.consistent(source.length, target.length)) {
StringBuilder sb = new StringBuilder();
sb.append(String.format("source: %s (length %d)\n", inputs[0], source.length));
sb.append(String.format("target: %s (length %d)\n", inputs[1], target.length));
sb.append("alignment: " + inputs[2]);
throw new InconsistentAlignmentException(sb.toString());
}
PhrasePair [][] phrasesByStart = initialPhrasePairs(source, target, alignment);
labeler.setInput(inp);
// Seed the work queue with one empty rule per source start position,
// each allowed at most NT_LIMIT nonterminals.
Queue q = new LinkedList();
for (int i = 0; i < source.length; i++)
q.offer(new Rule(source, target, alignment, i, NT_LIMIT));
=======
// NOTE(review): second, conflicting version of HierarchicalRuleExtractor from
// the other side of the unresolved merge; it duplicates the class declared
// earlier in this file and must be removed or merged before compiling.
public class HierarchicalRuleExtractor
{
// Maximum number of nonterminals allowed per rule.
private int arityLimit = 2;
// Maximum source-side length of an initial phrase pair.
private int initialPhraseSourceLimit = 10;
// Maximum target-side length of an initial phrase pair.
private int initialPhraseTargetLimit = 10;
// Presumably restricts extraction to minimal phrase pairs — TODO confirm.
private boolean requireMinimalPhrases = true;
// Minimum alignment points an initial phrase must contain (presumably).
private int minimumInitialAlignmentPoints = 1;
// Whether adjacent nonterminals are permitted.
private boolean allowAdjacent = false;
// Maximum number of symbols on the source side of a rule.
private int sourceSymbolLimit = 5;
// Maximum number of symbols on the target side of a rule.
private int targetSymbolLimit = 1000;
// Minimum alignment points a rule must contain (presumably).
private int minimumRuleAlignmentPoints = 1;
// Whether fully abstract (no-terminal) rules are allowed.
private boolean allowAbstract = false;
/**
 * Default constructor: keeps every default parameter value above.
 */
public HierarchicalRuleExtractor()
{
// just use the defaults!
}
>>>>>>> e00e5499ed868ec9ad4c03b7c320143811e1eda2
public HierarchicalRuleExtractor(int arity,
int initialPhraseSource, |