本文轉載至:
http://www.aboutyun.com/thread-7358-1-1.html
Hadoop 輸出文本時,默認統一採用不帶 BOM 的 UTF-8 編碼;而中文 Windows 系統的默認編碼是 GBK。因此,某些格式的文件(例如 CSV)如果以不帶 BOM 的 UTF-8 輸出,用 Excel 打開時會顯示為亂碼,只能用 UltraEdit(UE)或者記事本打開才能正常顯示。所以,將 Hadoop 的輸出編碼更改為 GBK 是非常常見的需求。
默認情況下,MR 主程序中設定輸出格式(輸出編碼由該輸出格式類決定)的語句為:
- job.setOutputFormatClass(TextOutputFormat.class);
- TextOutputFormat.class
的代碼如下:
- /**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- package org.apache.hadoop.mapreduce.lib.output;
- import java.io.DataOutputStream;
- import java.io.IOException;
- import java.io.UnsupportedEncodingException;
- import org.apache.hadoop.classification.InterfaceAudience;
- import org.apache.hadoop.classification.InterfaceStability;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.FileSystem;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.fs.FSDataOutputStream;
- import org.apache.hadoop.io.NullWritable;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.io.compress.CompressionCodec;
- import org.apache.hadoop.io.compress.GzipCodec;
- import org.apache.hadoop.mapreduce.OutputFormat;
- import org.apache.hadoop.mapreduce.RecordWriter;
- import org.apache.hadoop.mapreduce.TaskAttemptContext;
- import org.apache.hadoop.util.*;
- /** An {@link OutputFormat} that writes plain text files.
- * Each record becomes one line: key, separator, value, then a newline
- * (see {@code LineRecordWriter.write}).
- */
- @InterfaceAudience.Public
- @InterfaceStability.Stable
- public class TextOutputFormat<K, V> extends FileOutputFormat<K, V> {
- public static String SEPERATOR = "mapreduce.output.textoutputformat.separator"; // configuration key for the key/value separator, read in getRecordWriter
- protected static class LineRecordWriter<K, V> // writes records as lines to a DataOutputStream
- extends RecordWriter<K, V> {
- private static final String utf8 = "UTF-8"; // article note: change UTF-8 to GBK here
- private static final byte[] newline; // "\n" encoded once in the static initializer below
- static {
- try {
- newline = "\n".getBytes(utf8);
- } catch (UnsupportedEncodingException uee) {
- throw new IllegalArgumentException("can't find " + utf8 + " encoding");
- }
- }
- protected DataOutputStream out; // destination stream; closed in close()
- private final byte[] keyValueSeparator; // separator bytes, encoded once in the constructor
- public LineRecordWriter(DataOutputStream out, String keyValueSeparator) { // encodes the separator with the charset named by utf8
- this.out = out;
- try {
- this.keyValueSeparator = keyValueSeparator.getBytes(utf8);
- } catch (UnsupportedEncodingException uee) {
- throw new IllegalArgumentException("can't find " + utf8 + " encoding");
- }
- }
- public LineRecordWriter(DataOutputStream out) { // convenience constructor: tab as separator
- this(out, "\t");
- }
- /**
- * Write the object to the byte stream, handling Text as a special
- * case.
- * A Text is written from its backing byte array (the first getLength()
- * bytes) without re-encoding; any other object is written as toString()
- * encoded with the charset named by utf8.
- * @param o the object to print
- * @throws IOException if the write throws, we pass it on
- */
- private void writeObject(Object o) throws IOException {
- if (o instanceof Text) {
- Text to = (Text) o; // article note: comment out this line
- out.write(to.getBytes(), 0, to.getLength()); // article note: comment out this line
- } else { // article note: comment out this line
- out.write(o.toString().getBytes(utf8));
- }
- }
- public synchronized void write(K key, V value)
- throws IOException {
- boolean nullKey = key == null || key instanceof NullWritable; // null and NullWritable both count as "absent"
- boolean nullValue = value == null || value instanceof NullWritable;
- if (nullKey && nullValue) { // nothing to write: no line is emitted at all
- return;
- }
- if (!nullKey) {
- writeObject(key);
- }
- if (!(nullKey || nullValue)) { // separator only when both key and value are present
- out.write(keyValueSeparator);
- }
- if (!nullValue) {
- writeObject(value);
- }
- out.write(newline); // every emitted record ends with the newline bytes
- }
- public synchronized
- void close(TaskAttemptContext context) throws IOException {
- out.close(); // closes the wrapped stream; the context argument is not used
- }
- }
- public RecordWriter<K, V>
- getRecordWriter(TaskAttemptContext job
- ) throws IOException, InterruptedException {
- Configuration conf = job.getConfiguration();
- boolean isCompressed = getCompressOutput(job);
- String keyValueSeparator= conf.get(SEPERATOR, "\t"); // separator from the configuration, tab when unset
- CompressionCodec codec = null;
- String extension = "";
- if (isCompressed) {
- Class<? extends CompressionCodec> codecClass =
- getOutputCompressorClass(job, GzipCodec.class); // GzipCodec is the fallback passed to the lookup
- codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
- extension = codec.getDefaultExtension(); // file extension supplied by the codec
- }
- Path file = getDefaultWorkFile(job, extension);
- FileSystem fs = file.getFileSystem(conf);
- if (!isCompressed) {
- FSDataOutputStream fileOut = fs.create(file, false); // second argument is presumably the overwrite flag - confirm against FileSystem.create
- return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
- } else {
- FSDataOutputStream fileOut = fs.create(file, false);
- return new LineRecordWriter<K, V>(new DataOutputStream // compressed case: wrap the file stream in the codec's output stream
- (codec.createOutputStream(fileOut)),
- keyValueSeparator);
- }
- }
- }
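從上述代碼中 LineRecordWriter 的常量定義 private static final String utf8 = "UTF-8"; 可以看出,hadoop 已經將此輸出格式的編碼固定為 UTF-8。因此,要改變 hadoop 輸出文本的編碼,只需定義一個與 TextOutputFormat 內容相同的類 GbkOutputFormat,同樣繼承 FileOutputFormat(注意是 org.apache.hadoop.mapreduce.lib.output.FileOutputFormat),並把其中的編碼由 UTF-8 改為 GBK 即可,如下代碼: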
- import java.io.DataOutputStream;
- import java.io.IOException;
- import java.io.UnsupportedEncodingException;
- import org.apache.hadoop.classification.InterfaceAudience;
- import org.apache.hadoop.classification.InterfaceStability;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.FileSystem;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.fs.FSDataOutputStream;
- import org.apache.hadoop.io.NullWritable;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.io.compress.CompressionCodec;
- import org.apache.hadoop.io.compress.GzipCodec;
- import org.apache.hadoop.mapreduce.OutputFormat;
- import org.apache.hadoop.mapreduce.RecordWriter;
- import org.apache.hadoop.mapreduce.TaskAttemptContext;
- import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
- import org.apache.hadoop.util.*;
- @InterfaceAudience.Public
- @InterfaceStability.Stable
- public class GbkOutputFormat<K, V> extends FileOutputFormat<K, V> {
- public static String SEPERATOR = "mapreduce.output.textoutputformat.separator";
- protected static class LineRecordWriter<K, V>
- extends RecordWriter<K, V> {
- private static final String utf8 = "GBK";
- private static final byte[] newline;
- static {
- try {
- newline = "\n".getBytes(utf8);
- } catch (UnsupportedEncodingException uee) {
- throw new IllegalArgumentException("can't find " + utf8 + " encoding");
- }
- }
- protected DataOutputStream out;
- private final byte[] keyValueSeparator;
- public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {
- this.out = out;
- try {
- this.keyValueSeparator = keyValueSeparator.getBytes(utf8);
- } catch (UnsupportedEncodingException uee) {
- throw new IllegalArgumentException("can't find " + utf8 + " encoding");
- }
- }
- public LineRecordWriter(DataOutputStream out) {
- this(out, "\t");
- }
- /**
- * Write the object to the byte stream, handling Text as a special
- * case.
- * @param o the object to print
- * @throws IOException if the write throws, we pass it on
- */
- private void writeObject(Object o) throws IOException {
- if (o instanceof Text) {
- // Text to = (Text) o;
- // out.write(to.getBytes(), 0, to.getLength());
- // } else {
- out.write(o.toString().getBytes(utf8));
- }
- }
- public synchronized void write(K key, V value)
- throws IOException {
- boolean nullKey = key == null || key instanceof NullWritable;
- boolean nullValue = value == null || value instanceof NullWritable;
- if (nullKey && nullValue) {
- return;
- }
- if (!nullKey) {
- writeObject(key);
- }
- if (!(nullKey || nullValue)) {
- out.write(keyValueSeparator);
- }
- if (!nullValue) {
- writeObject(value);
- }
- out.write(newline);
- }
- public synchronized
- void close(TaskAttemptContext context) throws IOException {
- out.close();
- }
- }
- public RecordWriter<K, V>
- getRecordWriter(TaskAttemptContext job
- ) throws IOException, InterruptedException {
- Configuration conf = job.getConfiguration();
- boolean isCompressed = getCompressOutput(job);
- String keyValueSeparator= conf.get(SEPERATOR, "\t");
- CompressionCodec codec = null;
- String extension = "";
- if (isCompressed) {
- Class<? extends CompressionCodec> codecClass =
- getOutputCompressorClass(job, GzipCodec.class);
- codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
- extension = codec.getDefaultExtension();
- }
- Path file = getDefaultWorkFile(job, extension);
- FileSystem fs = file.getFileSystem(conf);
- if (!isCompressed) {
- FSDataOutputStream fileOut = fs.create(file, false);
- return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
- } else {
- FSDataOutputStream fileOut = fs.create(file, false);
- return new LineRecordWriter<K, V>(new DataOutputStream
- (codec.createOutputStream(fileOut)),
- keyValueSeparator);
- }
- }
- }
最後,在 MR 主程序中將輸出格式類設置為 GbkOutputFormat.class,如:
- job.setOutputFormatClass(GbkOutputFormat.class);
參考:
- http://semantic.iteye.com/blog/1846238