注册 登录  
 加关注
   显示下一条  |  关闭
温馨提示!由于新浪微博认证机制调整,您的新浪微博帐号绑定已过期,请重新绑定!立即重新绑定新浪微博》  |  关闭

anqiang专栏

不要问细节是怎么搞的,源码说明一切

 
 
 

日志

 
 

BayesFileFormatter源码注释  

2010-04-02 17:24:11|  分类: Hadoop & Mahout |  标签: |举报 |字号 订阅

  下载LOFTER 我的照片书  |

 这个类的主要功能是:将本地的一个文件夹内的数据文件转换为bayes M/R job可以读取的数据格式。

有两种实现:一种是将本地文件夹内的数据写入到一个输出文件夹内,每个输入文件对应一个输出文件;另一种是将本地文件夹内的数据写入到一个大的数据输出文件中。

输出的数据格式如下

数据类别(label) \t  termOne (空格) termTwo(空格) ...

 

package org.apache.mahout.classifier;

 

import org.apache.commons.cli2.CommandLine;

import org.apache.commons.cli2.Group;

import org.apache.commons.cli2.Option;

import org.apache.commons.cli2.OptionException;

import org.apache.commons.cli2.builder.ArgumentBuilder;

import org.apache.commons.cli2.builder.DefaultOptionBuilder;

import org.apache.commons.cli2.builder.GroupBuilder;

import org.apache.commons.cli2.commandline.Parser;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.Token;

import org.apache.lucene.analysis.TokenStream;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.mahout.common.IOUtils;

import org.slf4j.Logger;

import org.slf4j.LoggerFactory;

 

import java.io.File;

import java.io.FileFilter;

import java.io.FileInputStream;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStreamReader;

import java.io.OutputStreamWriter;

import java.io.Reader;

import java.io.Writer;

import java.nio.charset.Charset;

import java.util.ArrayList;

import java.util.List;

 

/**

 * Flatten a file into format that can be read by the Bayes M/R job. <p/> One document per line, first token is the

 * label followed by a tab, rest of the line are the terms.

 *

 * terms 之间用空格分开

 *

 */

public class BayesFileFormatter {

 

  private static final Logger log = LoggerFactory.getLogger(BayesFileFormatter.class);

 

  //行分割符

  private static final String LINE_SEP = System.getProperty("line.separator");

 

  private BayesFileFormatter() {

  }

 

  /**

   * Collapse all the files in the inputDir into a single file in the proper Bayes format, 1 document per line

   *

   * @param label      The label

   * @param analyzer   The analyzer to use 分词工具

   * @param inputDir   The input Directory 输入文件夹

   * @param charset    The charset of the input files 字符集编码

   * @param outputFile The file to collapse to

   */

  public static void collapse(String label, Analyzer analyzer, File inputDir,

                              Charset charset, File outputFile) throws IOException {

    Writer writer = new OutputStreamWriter(new FileOutputStream(outputFile),

        charset);

    try {

      inputDir.listFiles(new FileProcessor(label, analyzer, charset, writer));

      // listFiles() is called here as a way to recursively visit files, actually

    } finally {

      IOUtils.quietClose(writer);

    }

  }

 

  /**

   * Write the input files to the outdir, one output file per input file

   *

   * @param label    The label of the file

   * @param analyzer The analyzer to use

   * @param input    The input file or directory. May not be null

   * @param charset  The Character set of the input files

   * @param outDir   The output directory. Files will be written there with the same name as the input file

   */

  public static void format(String label, Analyzer analyzer, File input,

                            Charset charset, File outDir) throws IOException {

    if (input.isDirectory()) {

      input.listFiles(new FileProcessor(label, analyzer, charset, outDir));

    } else {

      Writer writer = new OutputStreamWriter(new FileOutputStream(new File(

          outDir, input.getName())), charset);

      try {

        writeFile(label, analyzer, input, charset, writer);

      } finally {

        IOUtils.quietClose(writer);

      }

    }

  }

 

  /**

   * Hack the FileFilter mechanism so that (we don't get stuck on large directories and don't have to loop the list

   * twice)

   *

   * accept()函数中调用了writeFile函数将各个文件中的信息写入到一个大文件中

   * 这有点显得功能过耦合

   *

   * 主要是为了避免对巨大文件夹的二次循环读取

   */

  private static class FileProcessor implements FileFilter {

    private final String label;

 

    private final Analyzer analyzer;

 

    //需要写入的文件来源dir

    private File outputDir;

 

    private final Charset charset;

 

    //写入的目标writer

    private Writer writer;

 

    /**

     * Use this when you want to collapse all files to a single file

     *

     * @param label  The label

     * @param writer must not be null and will not be closed

     */

    private FileProcessor(String label, Analyzer analyzer, Charset charset,

                          Writer writer) {

      this.label = label;

      this.analyzer = analyzer;

      this.charset = charset;

      this.writer = writer;

    }

 

    /**

     * Use this when you want a writer per file

     *

     * @param outputDir must not be null.

     */

    private FileProcessor(String label, Analyzer analyzer, Charset charset,

                          File outputDir) {

      this.label = label;

      this.analyzer = analyzer;

      this.charset = charset;

      this.outputDir = outputDir;

    }

 

    @Override

    public boolean accept(File file) {

      if (file.isFile()) {

        Writer theWriter = null;

        try {

          if (writer == null) {

            theWriter = new OutputStreamWriter(new FileOutputStream(new File(

                outputDir, file.getName())), charset);

          } else {

            theWriter = writer;

          }

         

          //只要是文件就写入文件内容

          writeFile(label, analyzer, file, charset, theWriter);

          if (writer != null) {

            // just write a new line

            theWriter.write(LINE_SEP);

          }

        } catch (IOException e) {

          // TODO: report failed files instead of throwing exception

          throw new IllegalStateException(e);

        } finally {

          if (writer == null) {

            IOUtils.quietClose(theWriter);

          }

        }

      } else {

        file.listFiles(this);

      }

      return false;

    }

  }

 

  /**

   * Write the tokens and the label from the Reader to the writer

   *

   * 将一个文件按照一定的字符编码格式写入到另外一个文件中writer

   * 格式如下:

   * label \t term1 term2 ...

   *

   *

   * @param label    The label

   * @param analyzer The analyzer to use

   * @param inFile   the file to read and whose contents are passed to the analyzer

   * @param charset  character encoding to assume when reading the input file

   * @param writer   The Writer, is not closed by this method

   * @throws java.io.IOException if there was a problem w/ the reader

   */

  private static void writeFile(String label, Analyzer analyzer, File inFile,

                                Charset charset, Writer writer) throws IOException {

    Reader reader = new InputStreamReader(new FileInputStream(inFile), charset);

    try {

      TokenStream ts = analyzer.tokenStream(label, reader);

      writer.write(label);

      writer.write('\t'); // edit: Inorder to match Hadoop standard

     

      // TextInputFormat

      Token token = new Token();

      while ((token = ts.next(token)) != null) {

        char[] termBuffer = token.termBuffer();

        int termLen = token.termLength();

        writer.write(termBuffer, 0, termLen);

       

        //terms 之间用空格分开

        writer.write(' ');

      }

    } finally {

      IOUtils.quietClose(reader);

    }

  }

 

  /**

   * Convert a Reader to a vector

   *

   * 将一个reader的内容进行分词并读取到字符串数组中

   *

   * @param analyzer The Analyzer to use

   * @param reader   The reader to feed to the Analyzer

   * @return An array of unique tokens

   */

  public static String[] readerToDocument(Analyzer analyzer, Reader reader)

      throws IOException {

    TokenStream ts = analyzer.tokenStream("", reader);

 

    Token token;

    List<String> coll = new ArrayList<String>();

    while ((token = ts.next()) != null) {

      char[] termBuffer = token.termBuffer();

      int termLen = token.termLength();

      String val = new String(termBuffer, 0, termLen);

      coll.add(val);

    }

    return coll.toArray(new String[coll.size()]);

  }

 

 

}

  评论这张
 
阅读(902)| 评论(0)
推荐 转载

历史上的今天

评论

<#--最新日志,群博日志--> <#--推荐日志--> <#--引用记录--> <#--博主推荐--> <#--随机阅读--> <#--首页推荐--> <#--历史上的今天--> <#--被推荐日志--> <#--上一篇,下一篇--> <#-- 热度 --> <#-- 网易新闻广告 --> <#--右边模块结构--> <#--评论模块结构--> <#--引用模块结构--> <#--博主发起的投票-->
 
 
 
 
 
 
 
 
 
 
 
 
 
 

页脚

网易公司版权所有 ©1997-2017