Projects >> book >> 378839ab4eb7526d3210c0559d4ed857ea4ed8b0

Chunk
Conflicting content
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
<<<<<<< HEAD
=======
import org.apache.mahout.classifier.ClassifierResult;

>>>>>>> 678d92be45331bdd660b941831296676aa1233e6
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.update.AddUpdateCommand;
Solution content
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.update.AddUpdateCommand;
File
BayesUpdateRequestProcessor.java
Developer's decision
Version 1
Kind of conflict
Import
Chunk
Conflicting content
package com.tamingtext.classifier.bayes;

<<<<<<< HEAD
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

=======
>>>>>>> 678d92be45331bdd660b941831296676aa1233e6
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
Solution content
package com.tamingtext.classifier.bayes;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
File
ExtractTrainingData.java
Developer's decision
Version 1
Kind of conflict
Import
Chunk
Conflicting content
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.lucene.document.Document;
<<<<<<< HEAD
import org.apache.lucene.document.Fieldable;
=======
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsEnum;
>>>>>>> 678d92be45331bdd660b941831296676aa1233e6
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.MultiFields;
Solution content
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.util.Tool;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader;
File
ExtractTrainingData.java
Developer's decision
Version 1
Kind of conflict
Import
Chunk
Conflicting content
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

<<<<<<< HEAD
import com.google.common.io.Closeables;

/** A utility to extract training data from a Lucene index using document term vectors to recreate the list of terms
 *  found in each document. Writes output in Mahout Document Sequence File Format with the category and id as the key and the text as the value */
public class ExtractTrainingData extends Configured implements Tool {

  private static final Logger log = LoggerFactory.getLogger(ExtractTrainingData.class);
  private final Map writers = new HashMap();
  
  public static void main(String[] args) throws Exception {
      ToolRunner.run(new ExtractTrainingData(), args);
      //System.exit(ret);
  }

  public int run(String[] args) throws IOException {

    Configuration conf = getConf();
    FileSystem fs = FileSystem.getLocal(conf);
    
=======
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/**
 * A utility to extract training data from a Lucene index using document term vectors to recreate the list of terms
 * found in each document. Writes output in Mahout Bayes classifier input format
 */
public class ExtractTrainingData {

  private static final Logger log = LoggerFactory.getLogger(ExtractTrainingData.class);

  static final Map trainingWriters = new HashMap();

  public static void main(String[] args) {

>>>>>>> 678d92be45331bdd660b941831296676aa1233e6
    log.info("Command-line arguments: " + Arrays.toString(args));

    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
Solution content
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.io.Closeables;


/** A utility to extract training data from a Lucene index using document term vectors to recreate the list of terms
 *  found in each document. Writes output in Mahout Document Sequence File Format with the category and id as the key and the text as the value */
public class ExtractTrainingData extends Configured implements Tool {

  private static final Logger log = LoggerFactory.getLogger(ExtractTrainingData.class);
  private final Map writers = new HashMap();
  
  public static void main(String[] args) throws Exception {
      ToolRunner.run(new ExtractTrainingData(), args);
      //System.exit(ret);
  }

  public int run(String[] args) throws IOException {

    Configuration conf = getConf();
    FileSystem fs = FileSystem.getLocal(conf);
    
    log.info("Command-line arguments: " + Arrays.toString(args));

    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
File
ExtractTrainingData.java
Developer's decision
Version 1
Kind of conflict
Attribute
Class signature
Comment
Import
Method declaration
Method invocation
Method signature
Variable
Chunk
Conflicting content
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputOpt = obuilder.withLongName("dir")
<<<<<<< HEAD
        .withRequired(true)
        .withArgument(
            abuilder.withName("dir")
            .withMinimum(1)
            .withMaximum(1).create())
=======
            .withRequired(true)
            .withArgument(
                    abuilder.withName("dir")
                            .withMinimum(1)
                            .withMaximum(1).create())
>>>>>>> 678d92be45331bdd660b941831296676aa1233e6
            .withDescription("Lucene index directory containing input data")
            .withShortName("d").create();
Solution content
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputOpt = obuilder.withLongName("dir")
        .withRequired(true)
        .withArgument(
            abuilder.withName("dir")
            .withMinimum(1)
            .withMaximum(1).create())
            .withDescription("Lucene index directory containing input data")
            .withShortName("d").create();
File
ExtractTrainingData.java
Developer's decision
Version 1
Kind of conflict
Method invocation
Chunk
Conflicting content
            .withShortName("d").create();

    Option categoryOpt = obuilder.withLongName("categories")
<<<<<<< HEAD
        .withRequired(true)
        .withArgument(
            abuilder.withName("file")
            .withMinimum(1)
            .withMaximum(1).create())
=======
            .withRequired(true)
            .withArgument(
                    abuilder.withName("file")
                            .withMinimum(1)
                            .withMaximum(1).create())
>>>>>>> 678d92be45331bdd660b941831296676aa1233e6
            .withDescription("File containing a list of categories")
            .withShortName("c").create();
Solution content
            .withShortName("d").create();

    Option categoryOpt = obuilder.withLongName("categories")
        .withRequired(true)
        .withArgument(
            abuilder.withName("file")
            .withMinimum(1)
            .withMaximum(1).create())
            .withDescription("File containing a list of categories")
            .withShortName("c").create();
File
ExtractTrainingData.java
Developer's decision
Version 1
Kind of conflict
Method invocation
Chunk
Conflicting content
        .create();
=======
            .withShortName("c").create();

    Option outputOpt = obuilder.withLongName("output")
<<<<<<< HEAD
        .withRequired(false)
        .withArgument(
            abuilder.withName("output")
            .withMinimum(1)
            .withMaximum(1).create())
            .withDescription("Output directory")
            .withShortName("o").create();

    Option categoryFieldsOpt = 
        obuilder.withLongName("category-fields")
        .withRequired(true)
        .withArgument(
            abuilder.withName("fields")
            .withMinimum(1)
            .withMaximum(1)
            .create())
            .withDescription("Fields to match categories against (comma-delimited)")
            .withShortName("cf").create();

    Option textFieldsOpt = 
        obuilder.withLongName("text-fields")
        .withRequired(true)
        .withArgument(
            abuilder.withName("fields")
            .withMinimum(1)
            .withMaximum(1)
            .create())
            .withDescription("Fields from which to extract training text (comma-delimited)")
            .withShortName("tf").create();

    Option useTermVectorsOpt = obuilder.withLongName("use-term-vectors")
        .withDescription("Extract term vectors containing preprocessed data " +
            "instead of unprocessed, stored text values")
            .withShortName("tv").create();

    Option helpOpt = obuilder.withLongName("help")
        .withDescription("Print out help")
        .withShortName("h").create();

    Group group = gbuilder.withName("Options")
        .withOption(inputOpt)
        .withOption(categoryOpt)
        .withOption(outputOpt)
        .withOption(categoryFieldsOpt)
        .withOption(textFieldsOpt)
        .withOption(useTermVectorsOpt)
            .withRequired(false)
            .withArgument(
                    abuilder.withName("output")
                            .withMinimum(1)
                            .withMaximum(1).create())
            .withDescription("Output directory")
            .withShortName("o").create();

    Option categoryFieldsOpt =
            obuilder.withLongName("category-fields")
                    .withRequired(true)
                    .withArgument(
                            abuilder.withName("fields")
                                    .withMinimum(1)
                                    .withMaximum(1)
                                    .create())
                    .withDescription("Fields to match categories against (comma-delimited)")
                    .withShortName("cf").create();

    Option textFieldsOpt =
            obuilder.withLongName("text-fields")
                    .withRequired(true)
                    .withArgument(
                            abuilder.withName("fields")
                                    .withMinimum(1)
                                    .withMaximum(1)
                                    .create())
                    .withDescription("Fields from which to extract training text (comma-delimited)")
                    .withShortName("tf").create();

    Option useTermVectorsOpt = obuilder.withLongName("use-term-vectors")
            .withDescription("Extract term vectors containing preprocessed data " +
                    "instead of unprocessed, stored text values")
            .withShortName("tv").create();

    Option helpOpt = obuilder.withLongName("help")
            .withDescription("Print out help")
            .withShortName("h").create();

    Group group = gbuilder.withName("Options")
            .withOption(inputOpt)
            .withOption(categoryOpt)
            .withOption(outputOpt)
            .withOption(categoryFieldsOpt)
            .withOption(textFieldsOpt)
            .withOption(useTermVectorsOpt)
            .create();
>>>>>>> 678d92be45331bdd660b941831296676aa1233e6

    try {
      Parser parser = new Parser();
Solution content
            .withShortName("c").create();

    Option outputOpt = obuilder.withLongName("output")
        .withRequired(false)
        .withArgument(
            abuilder.withName("output")
            .withMinimum(1)
            .withMaximum(1).create())
            .withDescription("Output directory")
            .withShortName("o").create();

    Option categoryFieldsOpt = 
        obuilder.withLongName("category-fields")
        .withRequired(true)
        .withArgument(
            abuilder.withName("fields")
            .withMinimum(1)
            .withMaximum(1)
            .create())
            .withDescription("Fields to match categories against (comma-delimited)")
            .withShortName("cf").create();

    Option textFieldsOpt = 
        obuilder.withLongName("text-fields")
        .withRequired(true)
        .withArgument(
            abuilder.withName("fields")
            .withMinimum(1)
            .withMaximum(1)
            .create())
            .withDescription("Fields from which to extract training text (comma-delimited)")
            .withShortName("tf").create();

    Option useTermVectorsOpt = obuilder.withLongName("use-term-vectors")
        .withDescription("Extract term vectors containing preprocessed data " +
            "instead of unprocessed, stored text values")
            .withShortName("tv").create();

    Option helpOpt = obuilder.withLongName("help")
        .withDescription("Print out help")
        .withShortName("h").create();

    Group group = gbuilder.withName("Options")
        .withOption(inputOpt)
        .withOption(categoryOpt)
        .withOption(outputOpt)
        .withOption(categoryFieldsOpt)
        .withOption(textFieldsOpt)
        .withOption(useTermVectorsOpt)
        .create();

    try {
      Parser parser = new Parser();
File
ExtractTrainingData.java
Developer's decision
Version 1
Kind of conflict
Method invocation
Variable
Chunk
Conflicting content
        return 1;
      }

<<<<<<< HEAD
      Path inputDir = new Path(cmdLine.getValue(inputOpt).toString());
      
      if (!fs.getFileStatus(inputDir).isDir()) {
        throw new IllegalArgumentException(inputDir + " does not exist or is not a directory");
      }

      Path categoryFile = new Path(cmdLine.getValue(categoryOpt).toString());

      if (!fs.isFile(categoryFile)) {
        throw new IllegalArgumentException(categoryFile + " does not exist or is not a directory");
      }

      Path outputDir = new Path(cmdLine.getValue(outputOpt).toString());
      fs.mkdirs(outputDir);
     
      if (!fs.getFileStatus(outputDir).isDir()) {
        throw new IllegalArgumentException("the output directory for " + outputDir + " is not a directory or could not be created");
=======
      File inputDir = new File(cmdLine.getValue(inputOpt).toString());

      if (!inputDir.isDirectory()) {
        throw new IllegalArgumentException(inputDir + " does not exist or is not a directory");
      }

      File categoryFile = new File(cmdLine.getValue(categoryOpt).toString());

      if (!categoryFile.isFile()) {
        throw new IllegalArgumentException(categoryFile + " does not exist or is not a directory");
      }

      File outputDir = new File(cmdLine.getValue(outputOpt).toString());

      outputDir.mkdirs();

      if (!outputDir.isDirectory()) {
        throw new IllegalArgumentException(outputDir + " is not a directory or could not be created");
>>>>>>> 678d92be45331bdd660b941831296676aa1233e6
      }
      Collection categoryFields = stringToList(cmdLine.getValue(categoryFieldsOpt).toString());
Solution content
        return 1;
      }

      Path inputDir = new Path(cmdLine.getValue(inputOpt).toString());
      
      if (!fs.getFileStatus(inputDir).isDir()) {
        throw new IllegalArgumentException(inputDir + " does not exist or is not a directory");
      }

      Path categoryFile = new Path(cmdLine.getValue(categoryOpt).toString());

      if (!fs.isFile(categoryFile)) {
        throw new IllegalArgumentException(categoryFile + " does not exist or is not a directory");
      }

      Path outputDir = new Path(cmdLine.getValue(outputOpt).toString());
      fs.mkdirs(outputDir);
     
      if (!fs.getFileStatus(outputDir).isDir()) {
        throw new IllegalArgumentException("the output directory for " + outputDir + " is not a directory or could not be created");
      }

      Collection categoryFields = stringToList(cmdLine.getValue(categoryFieldsOpt).toString());
File
ExtractTrainingData.java
Developer's decision
Version 1
Kind of conflict
If statement
Method invocation
Throw statement
Variable
Chunk
Conflicting content
      boolean useTermVectors = cmdLine.hasOption(useTermVectorsOpt);

<<<<<<< HEAD
      extractTraininingData(fs, conf, inputDir, categoryFile, categoryFields, textFields, outputDir, useTermVectors);
=======
      extractTraininingData(inputDir, categoryFile, categoryFields, textFields, outputDir, useTermVectors);
>>>>>>> 678d92be45331bdd660b941831296676aa1233e6

    } catch (OptionException e) {
      log.error("Exception", e);
Solution content
      boolean useTermVectors = cmdLine.hasOption(useTermVectorsOpt);

      extractTraininingData(fs, conf, inputDir, categoryFile, categoryFields, textFields, outputDir, useTermVectors);

    } catch (OptionException e) {
      log.error("Exception", e);
File
ExtractTrainingData.java
Developer's decision
Version 1
Kind of conflict
Method invocation
Chunk
Conflicting content
   * @param outputDir      directory to write output to
   * @throws IOException
   */
<<<<<<< HEAD
  public void extractTraininingData(FileSystem fs, Configuration conf, Path indexDir, Path categoryFile, 
      Collection categoryFields, Collection textFields, Path outputDir, boolean useTermVectors) throws IOException {
=======
  public static void extractTraininingData(File indexDir, File categoryFile,
                                           Collection categoryFields, Collection textFields, File outputDir, boolean useTermVectors) throws IOException {
>>>>>>> 678d92be45331bdd660b941831296676aa1233e6

    log.info("Index dir: " + indexDir);
    log.info("Category file: " + categoryFile);
Solution content
   * @throws IOException
   */
  public void extractTraininingData(FileSystem fs, Configuration conf, Path indexDir, Path categoryFile, 
      Collection categoryFields, Collection textFields, Path outputDir, boolean useTermVectors) throws IOException {

    log.info("Index dir: " + indexDir);
    log.info("Category file: " + categoryFile);
File
ExtractTrainingData.java
Developer's decision
Version 1
Kind of conflict
Method signature
Chunk
Conflicting content
    log.info("Text fields: " + textFields.toString());
    log.info("Use Term Vectors?: " + useTermVectors);
    OpenObjectIntHashMap categoryCounts = new OpenObjectIntHashMap();
<<<<<<< HEAD
    Map> categories = readCategoryFile(fs, categoryFile);

    Directory dir = FSDirectory.open(new File(indexDir.toUri().getPath()));
    IndexReader reader = IndexReader.open(dir);
    int max = reader.maxDoc();

    StringBuilder buf = new StringBuilder();
    Text key   = new Text();
    Text value = new Text();
    fs.delete(outputDir, true);

    for (int i=0; i < max; i++) {
      if (!reader.isDeleted(i)) {
=======
    Map> categories = readCategoryFile(categoryFile);
    Directory dir = FSDirectory.open(indexDir);
    IndexReader reader = DirectoryReader.open(dir);
    int max = reader.maxDoc();

    StringBuilder buf = new StringBuilder();
    Bits liveDocs = MultiFields.getLiveDocs(reader);//is null if there are no deletions
    for (int i = 0; i < max; i++) {
      if (liveDocs == null || liveDocs.get(i)) {//if the bit is set, that means it is "live", i.e. not deleted
>>>>>>> 678d92be45331bdd660b941831296676aa1233e6
        Document d = reader.document(i);
        String category = null;
Solution content
    log.info("Text fields: " + textFields.toString());
    log.info("Use Term Vectors?: " + useTermVectors);
    OpenObjectIntHashMap categoryCounts = new OpenObjectIntHashMap();
    Map> categories = readCategoryFile(fs, categoryFile);

    Directory dir = FSDirectory.open(new File(indexDir.toUri().getPath()));
    IndexReader reader = IndexReader.open(dir);
    int max = reader.maxDoc();

    StringBuilder buf = new StringBuilder();
    Text key   = new Text();
    Text value = new Text();
    fs.delete(outputDir, true);

    for (int i=0; i < max; i++) {
      if (!reader.isDeleted(i)) {
        Document d = reader.document(i);
        String category = null;
File
ExtractTrainingData.java
Developer's decision
Version 1
Kind of conflict
Comment
For statement
If statement
Method invocation
Variable
Chunk
Conflicting content
        // determine whether any of the fields in this document contain a 
        // category in the category list
<<<<<<< HEAD
        fields: for (String field: categoryFields) {
          for (Fieldable f: d.getFieldables(field)) {
            if (f.isStored() && !f.isBinary()) {
=======
        fields:
        for (String field : categoryFields) {
          for (IndexableField f : d.getFields(field)) {
            if (f.stringValue() != null) {
>>>>>>> 678d92be45331bdd660b941831296676aa1233e6
              String fieldValue = f.stringValue().toLowerCase();
              for (String cat : categories.keySet()) {
                List cats = categories.get(cat);
Solution content
        // determine whether any of the fields in this document contain a 
        // category in the category list
        fields: for (String field: categoryFields) {
          for (Fieldable f: d.getFieldables(field)) {
            if (f.isStored() && !f.isBinary()) {
              String fieldValue = f.stringValue().toLowerCase();
              for (String cat: categories.keySet()) {
                List cats = categories.get(cat);
File
ExtractTrainingData.java
Developer's decision
Version 1
Kind of conflict
For statement
If statement
Chunk
Conflicting content
        buf.setLength(0);
        for (String field : textFields) {
          if (useTermVectors) {
<<<<<<< HEAD
            appendVectorTerms(buf, reader.getTermFreqVector(i, field));
          }
          else {
            appendFieldText(buf, d.getFieldable(field));
=======
            appendVectorTerms(buf, reader.getTermVector(i, field), liveDocs);
          } else {
            appendFieldText(buf, d.getField(field));
>>>>>>> 678d92be45331bdd660b941831296676aa1233e6
          }
        }
        key.set(i + "/" + category);
Solution content
        buf.setLength(0);
        for (String field: textFields) {
          if (useTermVectors) {
            appendVectorTerms(buf, reader.getTermFreqVector(i, field));
          }
          else {
            appendFieldText(buf, d.getFieldable(field));
          }
        }
        key.set(i + "/" + category);
File
ExtractTrainingData.java
Developer's decision
Version 1
Kind of conflict
Method invocation
Chunk
Conflicting content
    closeWriters();
    
      }
    }

<<<<<<< HEAD
=======
>>>>>>> 678d92be45331bdd660b941831296676aa1233e6
    if (log.isInfoEnabled()) {
      StringBuilder b = new StringBuilder();
      b.append("\nCatagory document counts:\n");
Solution content
      }
    }

    closeWriters();
    
    if (log.isInfoEnabled()) {
      StringBuilder b = new StringBuilder();
      b.append("\nCatagory document counts:\n");
File
ExtractTrainingData.java
Developer's decision
Version 1
Kind of conflict
Method invocation
Chunk
Conflicting content
   * @return
   * @throws IOException
   */
<<<<<<< HEAD
  public static Map> readCategoryFile(FileSystem fs, Path categoryFile) throws IOException {
    Map> categoryMap = new HashMap>();
    BufferedReader rin = new BufferedReader(new InputStreamReader(fs.open(categoryFile), "UTF-8"));
=======
  public static Map> readCategoryFile(File categoryFile) throws IOException {
    Map> categoryMap = new HashMap>();
    BufferedReader rin = new BufferedReader(new InputStreamReader(new FileInputStream(categoryFile), "UTF-8"));
>>>>>>> 678d92be45331bdd660b941831296676aa1233e6
    String line;
    while ((line = rin.readLine()) != null) {
      String[] parts = line.trim().toLowerCase().split("\\s+");
Solution content
   * @return
   * @throws IOException
   */
  public static Map> readCategoryFile(FileSystem fs, Path categoryFile) throws IOException {
    Map> categoryMap = new HashMap>();
    BufferedReader rin = new BufferedReader(new InputStreamReader(fs.open(categoryFile), "UTF-8"));
    String line;
    while ((line = rin.readLine()) != null) {
      String[] parts = line.trim().toLowerCase().split("\\s+");
File
ExtractTrainingData.java
Developer's decision
Version 1
Kind of conflict
Method invocation
Method signature
Variable
Chunk
Conflicting content
    }
    return out;
  }
<<<<<<< HEAD
  
  /** Close writers opened by {@link #getWriterForCategory(File, String)} */
  protected void closeWriters() {
    for (SequenceFile.Writer w: writers.values()) {
        Closeables.closeQuietly(w);
    }
  }

  /** Append the contents of the specified termVector to a buffer containing a list of terms
   * 
=======

  /**
   * Close writers opened by {@link #getWriterForCategory(File, String)}
   */
  protected static void closeWriters() {
    for (PrintWriter p : trainingWriters.values()) {
      p.close();
    }
  }

  /**
   * Append the contents of the specified termVector to a buffer containing a list of terms
   *
>>>>>>> 678d92be45331bdd660b941831296676aa1233e6
   * @param buf
   * @param tv
   * @param liveDocs
Solution content
    }
    return out;
  }
  
  /** Close writers opened by {@link #getWriterForCategory(File, String)} */
  protected void closeWriters() {
    for (SequenceFile.Writer w: writers.values()) {
        Closeables.closeQuietly(w);
    }
  }

  /** Append the contents of the specified termVector to a buffer containing a list of terms
   * 
   * @param buf
   * @param tv
File
ExtractTrainingData.java
Developer's decision
Version 1
Kind of conflict
Comment
Method declaration
Chunk
Conflicting content
  protected static void appendVectorTerms(StringBuilder buf, Terms tv, Bits liveDocs) throws IOException {
    if (tv == null) return;

<<<<<<< HEAD
    String[] terms = tv.getTerms();
    int[] frequencies = tv.getTermFrequencies();

    for (int j=0; j < terms.length; j++) {
      int freq = frequencies[j];
      String term = terms[j];
      for (int k=0; k < freq; k++) {
        buf.append(term).append(' ');
=======
    TermsEnum terms = tv.iterator(null);
    BytesRef ref = null;
    while ((ref = terms.next()) != null) {
      int freq = getFrequency(tv, terms, liveDocs);
      for (int k = 0; k < freq; k++) {
        buf.append(new String(ref.bytes, ref.offset, ref.length));//TODO: do we need a charset here?
>>>>>>> 678d92be45331bdd660b941831296676aa1233e6
      }
    }
  }
Solution content
    if (tv == null) return;

    String[] terms = tv.getTerms();
    int[] frequencies = tv.getTermFrequencies();

    for (int j=0; j < terms.length; j++) {
      int freq = frequencies[j];
      String term = terms[j];
      for (int k=0; k < freq; k++) {
        buf.append(term).append(' ');
      }
    }
  }
File
ExtractTrainingData.java
Developer's decision
Version 1
Kind of conflict
Array access
Comment
For statement
Method invocation
Variable
While statement
Chunk
Conflicting content
   * @param buf
   * @param f
   */
<<<<<<< HEAD
  protected static void appendFieldText(StringBuilder buf, Fieldable f) {
=======
  protected static void appendFieldText(StringBuilder buf, IndexableField f) {
>>>>>>> 678d92be45331bdd660b941831296676aa1233e6
    if (f == null) return;
    if (f.binaryValue() != null) return;
    if (f.stringValue() == null) return;
Solution content
   * @param buf
   * @param f
   */
  protected static void appendFieldText(StringBuilder buf, Fieldable f) {
    if (f == null) return;
File
ExtractTrainingData.java
Developer's decision
Version 1
Kind of conflict
Method signature
Chunk
Conflicting content
    buf.append(s);
  }

<<<<<<< HEAD
  /** Split a comma-delimited set of strings into a list
   * 
=======
  /**
   * Split a comma-delimited set of strings into a list
   *
>>>>>>> 678d92be45331bdd660b941831296676aa1233e6
   * @param input
   * @return
   */
Solution content
    buf.append(s);
  }

  /** Split a comma-delimited set of strings into a list
   * 
   * @param input
   * @return
   */
File
ExtractTrainingData.java
Developer's decision
Version 1
Kind of conflict
Comment
Chunk
Conflicting content
    
  }
  
<<<<<<< HEAD
  /** 
   * Defines index management mechanisms for Lucene based classifier model.
=======
  /** builda a lucene index suidable for knn based classification. Each category's content is indexed into
   *  separate documents in the index, and the category that has the haghest count in the tip N hits is 
   *  is the category that is assigned.
   * @param inputFiles
   * @throws Exception
>>>>>>> 678d92be45331bdd660b941831296676aa1233e6
   */
  public static abstract class IndexingCallback implements TwentyNewsgroupsCorpus.Callback {
    protected IndexWriter writer;
Solution content
    
  }
  
  /** 
   * Defines index management mechanisms for Lucene based classifier model.
   */
  public static abstract class IndexingCallback implements TwentyNewsgroupsCorpus.Callback {
    protected IndexWriter writer;
File
TrainMoreLikeThis.java
Developer's decision
Version 1
Kind of conflict
Comment
Chunk
Conflicting content
      
      writer.addDocument(d); //

<<<<<<< HEAD
=======
        Document d = new Document(); //
        id.setStringValue(category + "-" + lineCount++);
        categoryField.setStringValue(category);
        contentField.setStringValue(parts[1]);
        d.add(id);
        d.add(categoryField);
        d.add(contentField);
        
        writer.addDocument(d); //
      }
>>>>>>> 678d92be45331bdd660b941831296676aa1233e6
      /*
      Collect Content
      Build Document
Solution content
      
      writer.addDocument(d); //

      /*
      Collect Content
      Build Document
File
TrainMoreLikeThis.java
Developer's decision
Version 1
Kind of conflict
Comment
Method invocation
Variable
Chunk
Conflicting content
      log.info("KNN: Added document for category " + category + " named " + inputFile.getName());
    }
<<<<<<< HEAD
=======
    writer.setCommitData(generateUserData(categories));
    writer.commit();
    
    log.info("Knn: Added " + fileCount + " categories in " + (System.currentTimeMillis() - start) + " msec.");
>>>>>>> 678d92be45331bdd660b941831296676aa1233e6
  }

  /** builds a lucene index suitable for tfidf based classification. Each categories content is indexed into
Solution content
      log.info("KNN: Added document for category " + category + " named " + inputFile.getName());
    }
  }

  /** builds a lucene index suitable for tfidf based classification. Each categories content is indexed into
File
TrainMoreLikeThis.java
Developer's decision
Version 1
Kind of conflict
Method invocation
Chunk
Conflicting content
   */
  public static class TfidfIndexer extends IndexingCallback {
    
<<<<<<< HEAD
    // holds the collected content for each category
    final Map content = new HashMap();

    public TfidfIndexer(String dest, int ngramSize) throws IOException {
      super(dest, ngramSize);
    }
=======
    // reuse these fields
    Field id = new StringField("id", "", Field.Store.YES);
    Field categoryField = new StringField("category", "", Field.Store.YES);
    FieldType fieldType = new FieldType();
    fieldType.setIndexed(true);
    fieldType.setStoreTermVectorOffsets(true);
    fieldType.setStoreTermVectorPositions(true);
    Field contentField = new Field("content", "", fieldType);
    
    // read data from input files.
    
    
    for (File ff: inputFiles) {
      fileCount++;
      lineCount = 0;
>>>>>>> 678d92be45331bdd660b941831296676aa1233e6

    @Override
    public void process(String label, File inputFile) {
Solution content
   */
  public static class TfidfIndexer extends IndexingCallback {
    
    // holds the collected content for each category
    final Map content = new HashMap();

    public TfidfIndexer(String dest, int ngramSize) throws IOException {
      super(dest, ngramSize);
    }

    @Override
    public void process(String label, File inputFile) {
File
TrainMoreLikeThis.java
Developer's decision
Version 1
Kind of conflict
Attribute
Comment
For statement
Method declaration
Method invocation
Variable
Chunk
Conflicting content
        d.add(categoryField);
        d.add(contentField);

<<<<<<< HEAD
        writer.addDocument(d); //
        
        /*
          Collect Content
          Build Document
          Index Document
         */
        //

        log.info("TfIdf: Added document for category " + category);
      }

      super.close();
    }
  }

=======
      Document d = new Document(); //
      id.setStringValue(category + "-" + lineCount);
      categoryField.setStringValue(category);
      contentField.setStringValue(content.toString());
      d.add(id);
      d.add(categoryField);
      d.add(contentField);
      
      writer.addDocument(d); //
      /*
        Collect Content
        Build Document
        Index Document
       */
      //
      
      log.info("TfIdf: Added document for category " + category + " with " + lineCount + " lines");
    }
    writer.setCommitData(generateUserData(categories));
    writer.commit();
    
    log.info("TfIdf: Added " + fileCount + " categories in " + (System.currentTimeMillis() - start) + " msec.");
  }

  
  
  protected void openIndexWriter(String pathname) throws IOException {
    //
    Directory directory //
      = FSDirectory.open(new File(pathname));
    Analyzer analyzer   //
      = new EnglishAnalyzer(Version.LUCENE_47);
    
    if (nGramSize > 1) { //
      ShingleAnalyzerWrapper sw 
        = new ShingleAnalyzerWrapper(analyzer,
            nGramSize, // min shingle size
            nGramSize, // max shingle size
            "-",       // token separator
            true,      // output unigrams
            true);     // output unigrams if no shingles
      analyzer = sw;
    }
    
    IndexWriterConfig config //
      = new IndexWriterConfig(Version.LUCENE_47, analyzer);
    config.setOpenMode(OpenMode.CREATE);
    IndexWriter writer =  new IndexWriter(directory, config);
    /* 
    Create Index Directory
    Setup Analyzer
    Setup Shingle Filter
    Create IndexWriter
     */
    //
    this.writer = writer;
  }

  protected void closeIndexWriter() throws IOException {
    writer.close();
    writer = null;
  }
  
>>>>>>> 678d92be45331bdd660b941831296676aa1233e6
  protected static Map generateUserData(Collection categories) {
    StringBuilder b = new StringBuilder();
    for (String cat: categories) {
Solution content
        d.add(categoryField);
        d.add(contentField);

        writer.addDocument(d); //
        
        /*
          Collect Content
          Build Document
          Index Document
         */
        //

        log.info("TfIdf: Added document for category " + category);
      }

      super.close();
    }
  }

  protected static Map generateUserData(Collection categories) {
    StringBuilder b = new StringBuilder();
    for (String cat: categories) {
File
TrainMoreLikeThis.java
Developer's decision
Version 1
Kind of conflict
Comment
Method declaration
Method invocation
Variable