使用hadoop2.3+mahout+lucene将文本换算成vector

来源:转载


在http://www.daviddlewis.com/resources/testcollections/reuters21578/下载Reuters数据

参照:http://www.shellsec.com/tech/63646.html和http://blog.chinaunix.net/uid-20761674-id-3535501.html


跑的job:

  1. 将文本文件转换为sequencefile
  2. 分词(DocumentProcessor::DocumentTokenizer)
  3. 统计词频(DictionaryVectorizer::WordCount)
  4. 生成分区向量空间(DictionaryVectorizer::MakePartialVectors)
  5. 合并分区向量空间(PartialVectorMerger)
  6. 用tfidf算法计算文档向量空间各分量的权重(VectorTfIdf Document Frequency Count )
  7. 优化向量空间(Prune Vectors)——降维
  8. 合并优化后的向量空间(PrunerPartialVectorMerger)
  9. 再次生成分区向量空间(MakePartialVectors)
  10. 再次合并分区向量空间(PartialVectorMerge)
  11. 聚类(Cluster Iterator running)
  12. 分类(Cluster Classification Driver running)
  13. 输出聚类结果(Representative Points Driver running)


生成的向量化文件的目录结构是这样的:

  • df-count 目录:保存着文本的频率信息
  • tf-vectors 目录:保存着以 TF 作为权值的文本向量
  • tfidf-vectors 目录:保存着以 TFIDF 作为权值的文本向量
  • tokenized-documents 目录:保存着分词过后的文本信息
  • wordcount 目录:保存着全局的词汇出现的次数
  • dictionary.file-0 目录:保存着这些文本的词汇表
  • frequency.file-0 目录 : 保存着词汇表对应的频率信息。

在eclipse中运行需要以下的关于lucene的jar包

<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-benchmark</artifactId>
<version>4.6.1</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>4.6.1</version>
</dependency>

解压缩reuters数据

public static void extractReuters(){ 
File inputFolder = new File("datafile/reuters"); 
File outputFolder = new File("datafile/reuters-extracted"); 
ExtractReuters extractor = new ExtractReuters(inputFolder, outputFolder); 
extractor.extract(); 


将解压后的reuters数据转化成SequenceFile
public static void transformToSequenceFile(){ 
Configuration config = BasicConfig.config();
HdfsUtils hdfs = new HdfsUtils(BasicConfig.HDFS, config); 


String[] mahoutJars = {//local jars     mahout和lucene的相关jar包用于传到hadoop上的classpath中,是local的jar包
"/home/training/git/socialrecommendation/datafile/mahout-math-1.0-SNAPSHOT.jar",
"/home/training/git/socialrecommendation/datafile/lucene-analyzers-common-4.6.1.jar",
"/home/training/git/socialrecommendation/datafile/mahout-integration-1.0-SNAPSHOT.jar",
"/home/training/git/socialrecommendation/datafile/mahout-mrlegacy-1.0-SNAPSHOT.jar",
"/home/training/git/socialrecommendation/datafile/mahout-mrlegacy-1.0-SNAPSHOT-job.jar" ,
};

try {
hdfs.addJarToDistributedCache(Arrays.asList(mahoutJars), config);
} catch (IOException e1) {
e1.printStackTrace();
}
String[] args = {"-c", "UTF-8", "-i", BasicConfig.HDFS+"/user/hdfs/userCF/reutersExtracted", "-o",
BasicConfig.HDFS+"/user/hdfs/userCF/reutersSeqfiles"};   //在集群上运行的参数
 
/*String[] args = {"-c", "UTF-8", "-i", "datafile/reuters-extracted/", "-o",
"datafile/reuters-seqfiles"};*/                                                 //在本地运行的参数

try { 
/*SequenceFilesFromDirectory.main(args);*/   //在本地运行的参数
SequenceFilesFromDirectory job = new SequenceFilesFromDirectory();
job.main(args, config);
} catch (Exception e) { 
e.printStackTrace(); 

想要让这个job在hadoop集群上跑,

需要将64行

// Original Mahout entry point: ToolRunner builds its own default
// Configuration here, so any cluster settings prepared by the caller
// never reach the submitted job.
public static void main(String[] args) throws Exception {
    ToolRunner.run(new SequenceFilesFromDirectory(), args);
  }

改成

// Caller-supplied cluster Configuration, stashed so the job-building code
// can use it instead of ToolRunner's default.
// NOTE(review): a static mutable field is not thread-safe — assumes one
// job submission at a time; verify before reusing concurrently.
private static Configuration conf;

// Patched entry point: accepts the cluster Configuration alongside the
// CLI arguments so the YARN/HDFS settings reach the submitted job.
public static void main(String[] args, Configuration config) throws Exception {
conf = config;
    ToolRunner.run(new SequenceFilesFromDirectory(), args);
  }


将84行:HadoopUtil.delete(getConf(), output);

改成 HadoopUtil.delete(conf, output);

将89行://runSequential(getConf(), getInputPath(), output, options);

改成:runSequential(conf, getInputPath(), output, options);

将153行:    

     Job job = prepareJob(input, output, MultipleTextFileInputFormat.class,
      SequenceFilesFromDirectoryMapper.class, Text.class, Text.class,
      SequenceFileOutputFormat.class, "SequenceFilesFromDirectory", conf);

改成

    Job job = prepareJob(input, output, MultipleTextFileInputFormat.class,
      SequenceFilesFromDirectoryMapper.class, Text.class, Text.class,
      SequenceFileOutputFormat.class, "SequenceFilesFromDirectory");

进行以上改变主要是为了能够将 Configuration 的配置信息传到 job 上;否则 ToolRunner 会使用默认配置,该方法只能在本地运行。

另外在AbstractJob.java类中添加方法

  /**
   * Overload of AbstractJob#prepareJob that takes an explicit Configuration,
   * so the cluster settings built by the caller (fs.defaultFS, YARN
   * addresses, distributed-cache entries) reach the submitted Job instead of
   * a default configuration.
   *
   * @param inputPath    HDFS input path for the job
   * @param outputPath   HDFS output path for the job
   * @param inputFormat  InputFormat class to read the input with
   * @param mapper       Mapper implementation
   * @param mapperKey    mapper output key class
   * @param mapperValue  mapper output value class
   * @param outputFormat OutputFormat class for the job output
   * @param jobname      job name; when null, a name is derived from the mapper
   * @param conf         Configuration the Job is built from
   * @return a configured Job, not yet submitted
   * @throws IOException if the underlying Job cannot be created
   */
  protected Job prepareJob(Path inputPath,
         Path outputPath,
         Class<? extends InputFormat> inputFormat,
         Class<? extends Mapper> mapper,
         Class<? extends Writable> mapperKey,
         Class<? extends Writable> mapperValue,
         Class<? extends OutputFormat> outputFormat,
         String jobname, Configuration conf) throws IOException {

// Delegate to the Configuration-aware HadoopUtil helper.
Job job = HadoopUtil.prepareJob(inputPath, outputPath,
inputFormat, mapper, mapperKey, mapperValue, outputFormat, conf);

// Fall back to an auto-generated name when none was supplied.
String name =
jobname != null ? jobname : HadoopUtil.getCustomJobName(getClass().getSimpleName(), job, mapper, Reducer.class);

job.setJobName(name);
return job;
}


将SequenceFile进行向量化

public static void transformToVector(Long l){ 
 
Configuration config = BasicConfig.config();
HdfsUtils hdfs = new HdfsUtils(BasicConfig.HDFS, config);


String[] mahoutJars = {//local jars
"/home/training/git/socialrecommendation/datafile/mahout-math-1.0-SNAPSHOT.jar",

"/home/training/git/socialrecommendation/datafile/lucene-analyzers-common-4.6.1.jar",

"/home/training/git/socialrecommendation/datafile/mahout-integration-1.0-SNAPSHOT.jar",

"/home/training/git/socialrecommendation/datafile/mahout-mrlegacy-1.0-SNAPSHOT.jar",

"/home/training/git/socialrecommendation/datafile/mahout-mrlegacy-1.0-SNAPSHOT-job.jar" ,

};


try {
hdfs.addJarToDistributedCache(Arrays.asList(mahoutJars), config);
} catch (IOException e1) {
e1.printStackTrace();
}

String[] args = {"-a", "org.apache.lucene.analysis.core.WhitespaceAnalyzer", 
"-chunk", "200","-o", BasicConfig.HDFS+"/user/hdfs/userCF/"+l+"/reutersVectorsBigram", 
"-i", BasicConfig.HDFS+"/user/hdfs/userCF/reutersSeqfiles/", "-md", "3",
"-x", "90", "-wt", "tfidf", "-ml", "50","-ng", "2", 
 "-seq"};
try { 
SparseVectorsFromSequenceFiles job = new SparseVectorsFromSequenceFiles();
job.main(args,config); 
} catch (Exception e) { 
e.printStackTrace(); 



在类SparseVectorsFromSequenceFiles的方法

将54行

  // Original Mahout entry point: ToolRunner builds its own default
  // Configuration, so caller-supplied cluster settings are lost.
  private static Configuration conf;
  public static void main(String[] args) throws Exception {
    ToolRunner.run(new SparseVectorsFromSequenceFiles(), args);
  }

改成

  // Caller-supplied cluster Configuration for the job-building code.
  // NOTE(review): static mutable field — assumes one submission at a time.
  private static Configuration conf;
  // Patched entry point: accepts the cluster Configuration alongside the
  // CLI arguments so the YARN/HDFS settings reach the submitted job.
  public static void main(String[] args,Configuration config) throws Exception {
 conf=config;
    ToolRunner.run(new SparseVectorsFromSequenceFiles(), args);
  }

把253行:去掉Configuration conf = getConf();

进行以上改变主要是为了能够将 Configuration 的配置信息传到 job 上;否则 ToolRunner 会使用默认配置,该方法只能在本地运行。


在HighDFWordsPruner类  

第82行:    DistributedCache.setCacheFiles(new URI[]{dictionaryFilePath.toUri()}, conf);

改成    DistributedCache.addCacheFileAsFirstOne(dictionaryFilePath.toUri(), conf);

在DistributedCache类中

添加方法:

  /**
   * Prepends a file to the job's distributed-cache file list in {@code conf}.
   * Unlike {@code setCacheFiles}, this keeps any entries already registered
   * (e.g. classpath jars); unlike the stock {@code addCacheFile}, the new URI
   * is inserted FIRST, because downstream code reads only the first cache
   * file (the dictionary) and would otherwise pick up a jar instead.
   * @param uri The uri of the cache to be localized
   * @param conf Configuration to add the cache to
   */
  @Deprecated
  public static void addCacheFileAsFirstOne(URI uri, Configuration conf) {
    String files = conf.get(MRJobConfig.CACHE_FILES);
    conf.set(MRJobConfig.CACHE_FILES, files == null ? uri.toString() :  uri.toString()+ ","
             + files);
  }

由于使用set方法会把之前的放在classpath下的jar包的路径给删掉,因此使用add方法,并且需要改成添加成第一个路径,

因为在之后取的时候会只取第一个路径,因此不放在第一个会报错


其中:

这个是yarn的配置信息:

package com.hp.recommendation.util;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.yarn.conf.YarnConfiguration;

public class BasicConfig {

    public static final String HDFS = "hdfs://c0004649.itcs.hp.com:9000";

    public static final String YARN_RESOURCE="c0004650.itcs.hp.com"; 

public static Configuration config() {

        Configuration conf = new YarnConfiguration();
        conf.set("fs.defaultFS", BasicConfig.HDFS);
        conf.set("mapreduce.framework.name", "yarn");
        conf.set("yarn.resourcemanager.address", BasicConfig.YARN_RESOURCE+":8032");
        conf.set("yarn.resourcemanager.scheduler.address", BasicConfig.YARN_RESOURCE+":8030");
        //put all of the third party jars like this, but these jars should be put on hadoop clusters.
        //conf.set("mapreduce.application.classpath", "$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*,$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*,/opt/mount/learn/mahout-1.0-lib/*");将第三方jar包放到hadoop集群的每个节点中比较麻烦
        return conf;
    }


}

这个是如何操作hdfs文件的使用方法:

package com.hp.recommendation.util;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;

import com.hp.recommendation.model.WriteDataToHDFSModel;

/**
 * 
 * @author shijie
 * ref:http://blog.fens.me/hadoop-mahout-mapreduce-itemcf/
 */
public class HdfsUtils {
    private static final String HDFS = "hdfs://c0004649.itcs.hp.com:9000";

    public HdfsUtils(Configuration conf) {
        this(HDFS, conf);
    }

    public HdfsUtils(String hdfs, Configuration conf) {
        this.hdfsPath = hdfs;
        this.conf = conf;
    }


    private String hdfsPath;
    private Configuration conf;
    public static void getConf() {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", HDFS);
        FileSystem hdfs;
        try {
            hdfs = FileSystem.get(conf);
            FileStatus[] fs = hdfs.listStatus(new Path("/"));
            for (int i = 0; i < fs.length; i++) {
                System.out.println(fs[i].toString());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public void mkdirs(String folder) throws IOException {
        Path path = new Path(folder);
        FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
        if (!fs.exists(path)) {
            fs.mkdirs(path);
            System.out.println("Create: " + folder);
        }
        fs.close();
    }

    public void rmr(String folder) throws IOException {
        Path path = new Path(folder);
        FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
        fs.deleteOnExit(path);
        System.out.println("Delete: " + folder);
        fs.close();
    }

    public void ls(String folder) throws IOException {
        Path path = new Path(folder);
        FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
        FileStatus[] list = fs.listStatus(path);
        System.out.println("ls: " + folder);
        System.out.println("==========================================================");
        for (FileStatus f : list) {
            System.out.printf("name: %s, folder: %s, size: %d/n", f.getPath(), f.isDir(), f.getLen());
        }
        System.out.println("==========================================================");
        fs.close();
    }

    public void createFile(String file, String content) throws IOException {
        FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
        byte[] buff = content.getBytes();
        FSDataOutputStream os = null;
        try {
            os = fs.create(new Path(file));
            os.write(buff, 0, buff.length);
            System.out.println("Create: " + file);
        } finally {
            if (os != null)
                os.close();
        }
        fs.close();
    }






    public void copyFile(String local, String remote) throws IOException {
        FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
        fs.copyFromLocalFile(new Path(local), new Path(remote));
        System.out.println("copy from: " + local + " to " + remote);
        fs.close();
    }

    public void download(String remote, String local) throws IOException {
        Path path = new Path(remote);
        FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
        fs.copyToLocalFile(path, new Path(local));
        System.out.println("download: from" + remote + " to " + local);
        fs.close();
    }
    public void cat(String remoteFile) throws IOException {
        Path path = new Path(remoteFile);
        FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf)
        FSDataInputStream fsdis = null;
        System.out.println("cat: " + remoteFile);
        try {  
            fsdis =fs.open(path);
            IOUtils.copyBytes(fsdis, System.out, 4096, false);  
          } finally {  
            IOUtils.closeStream(fsdis);
            fs.close();
          }
    }

    public String getFile(String remoteFile) throws IOException {

        Path path = new Path(remoteFile);
        FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
        FSDataInputStream fsdis = null;
        System.out.println("cat: " + remoteFile);
        BufferedInputStream buffer = null;
        ByteArrayOutputStream outStream = new ByteArrayOutputStream();
        String str = null;

        try {
            fsdis =fs.open(path);
            buffer = new BufferedInputStream(fsdis);
            int BUFFER_SIZE = 4096;
            byte[] data = new byte[BUFFER_SIZE];
            int count = -1;
            while((count = buffer.read(data, 0, BUFFER_SIZE)) != -1){
            outStream.write(data, 0, count);
            }
            data = null;
            str = new String(outStream.toByteArray());
        } finally { 
        buffer.close();
        outStream.close();
            fs.close();
        }
        return str;
    }


    


    public void writeFileToHDFS(String source, String dest) throws IOException {

        FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
    // Get the filename out of the file path
    String filename = source.substring(source.lastIndexOf('/') + 1, source.length());

    // Create the destination path including the filename.
    if (dest.charAt(dest.length() - 1) != '/') {
    dest = dest + "/" + filename;
    } else {
    dest = dest + filename;
    }

    // Check if the file already exists
    Path path = new Path(dest);
    if (fs.exists(path)) {
    System.out.println("File " + dest + " already exists");
    return;
    }

    // Create a new file and write data to it.
    FSDataOutputStream out = fs.create(path);
    InputStream in = new BufferedInputStream(new FileInputStream(
    new File(source)));

    byte[] b = new byte[1024];
    int numBytes = 0;
    while ((numBytes = in.read(b)) > 0) {
    out.write(b, 0, numBytes);

    }

    // Close all the file descripters
    in.close();
    out.close();

    fs.close();

    }


    public WriteDataToHDFSModel getFileSystemAndFSDataOutputStream(String file) throws IOException {

        FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);

    // Check if the file already exists
    Path path = new Path(file);
    WriteDataToHDFSModel model = new WriteDataToHDFSModel();
    if (fs.exists(path)) {
    System.out.println("File " + file + " already exists");
    FSDataOutputStream out = fs.append(path);
    model.setOut(out);
    }else{
// Create a new file and write data to it.
    FSDataOutputStream out = fs.create(path);
    model.setOut(out);
    }

    model.setFs(fs);
    return model;

    }

    public void writeStringToFile(String content, String file) throws IOException {

        FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
    // Check if the file already exists
    Path path = new Path(file);
    if (fs.exists(path)) {
    System.out.println("File " + file + " already exists");
    FSDataOutputStream out = fs.append(path);
    BufferedOutputStream buffer = new BufferedOutputStream(out);
    buffer.write(content.getBytes());
    buffer.flush();
    // Close all the file descripters   
    buffer.close();
    out.close();
    }else{
// Create a new file and write data to it.
    FSDataOutputStream out = fs.create(path);
    BufferedOutputStream buffer = new BufferedOutputStream(out);
    buffer.write(content.getBytes());
    buffer.flush();
    // Close all the file descripters    
    buffer.close();
    out.close();
    }

    fs.close();
    }

    public static void addJarToDistributedCache(Class classToAdd,Configuration conf) throws IOException {


// Retrieve jar file for class2Add
String jar = classToAdd.getProtectionDomain().getCodeSource().getLocation().getPath();
System.out.println("jar=" + jar);
File jarFile = new File(jar);
// Declare new HDFS location
Path hdfsJar = new Path("/user/hadoop/lib/mahout/" + jarFile.getName());

// Mount HDFS
FileSystem hdfs = FileSystem.get(conf);

// Copy (override) jar file to HDFS
hdfs.copyFromLocalFile(false, true, new Path(jar), hdfsJar);

// Add jar to distributed classPath
DistributedCache.addFileToClassPath(hdfsJar, conf);
}

    //add jars to classpath by jar path
public static void addJarToDistributedCache(List<String> jarPaths,Configuration conf) throws IOException {
// Mount HDFS
FileSystem hdfs = FileSystem.get(conf);
for (String jar : jarPaths) {
File jarFile = new File(jar);

// Declare new HDFS location
Path hdfsJar = new Path("/user/hadoop/lib/mahout/" + jarFile.getName());

// Copy (override) jar file to HDFS
if (!hdfs.exists(hdfsJar)) {
hdfs.copyFromLocalFile(false, true, new Path(jar), hdfsJar);

}
// Add jar to distributed classPath
DistributedCache.addFileToClassPath(hdfsJar, conf);

}

}

}






分享给朋友:
您可能感兴趣的文章:
随机阅读: