【hadoop】python通過hdfs模塊讀hdfs數據


hdfs官網:http://hdfscli.readthedocs.io/en/latest/api.html

一個非常好的博客:http://blog.csdn.net/gamer_gyt/article/details/52446757

 

hdfs庫自帶的擴展模塊 hdfs.ext.avro 提供了Avro序列化與反序列化功能(AvroWriter / AvroReader),不需要單獨實現

#!/usr/bin/env python
# encoding: utf-8

"""Avro extension example: round-trip some records through HDFS."""

from hdfs import Config
from hdfs.ext.avro import AvroReader, AvroWriter


# Client for the default alias in the hdfscli configuration file.
client = Config().get_client()

# A couple of sample records; their flat structure lets the Avro
# writer infer a schema automatically.
records = [
  {'name': 'Ann', 'age': 23},
  {'name': 'Bob', 'age': 22},
]

# Serialize the records to an Avro file on HDFS, replacing any
# previous version of that file.
with AvroWriter(client, 'names.avro', overwrite=True) as writer:
  for rec in records:
    writer.write(rec)

# Deserialize the file again and confirm the round trip was lossless.
with AvroReader(client, 'names.avro') as reader:
  schema = reader.schema # Schema the writer inferred.
  content = reader.content # HDFS content summary of the remote file.
  assert list(reader) == records # Same records came back.

 

遍歷hdfs目錄

from hdfs import Client  # explicit import instead of `from hdfs import *`
import posixpath
from hdfs.ext.avro import AvroReader, AvroWriter


def main():
    """Walk an HDFS directory tree and print every Avro record found.

    Connects to the WebHDFS endpoint, recursively visits every file
    under ``/test/tmp_data`` and dumps each file's Avro records.
    """
    client = Client("http://127.0.0.1:50070")
    path = "/test/tmp_data"
    # client.walk mirrors os.walk: yields (root, directories, files) per
    # directory. Renamed from `dir`/`file` to avoid shadowing builtins.
    for root, dirs, files in client.walk(path):
        for fname in files:
            # HDFS paths always use '/', so join with posixpath rather
            # than os.path (which would produce '\\' on Windows).
            full_path = posixpath.join(root, fname)
            print(full_path)  # print() works on both Python 2 and 3
            with AvroReader(client, full_path) as reader:
                schema = reader.schema  # Schema inferred from the file.
                content = reader.content  # Remote file's HDFS content object.
                # Iterate the reader lazily; no need to materialize a list.
                for user in reader:
                    print(user)


if __name__ == "__main__":
    main()

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM