為了提供可擴展性,Kudu 表被划分為稱為 tablets 的單元,並分布在許多 tablet servers 上。行總是屬於單個 tablet 。將行分配給 tablet 的方法由在表創建期間設置的表的分區決定。 kudu提供了3種分區方式:
Range Partitioning ( 范圍分區 )
范圍分區按照分區列的取值范圍將行分配給 tablet;合理設置各分區的范圍,可以讓數據均衡地存儲到各個機器上,防止機器出現負載不均衡現象。 創建一張表,要求按照如下方式進行分區: create table rangeTable(CompanyId Type.INT32 , WorkId Type.INT32 , Name Type.STRING , Gender Type.STRING , Photo Type.STRING) RANGE (CompanyId) ( PARTITION 0 <= VALUES < 10, PARTITION 10 <= VALUES < 20, PARTITION 20 <= VALUES < 30, PARTITION 30 <= VALUES < 40, PARTITION 40 <= VALUES < 50, PARTITION 50 <= VALUES < 60, PARTITION 60 <= VALUES < 70, PARTITION 70 <= VALUES < 80, PARTITION 80 <= VALUES < 90 )
代碼實現:
public class createRangePartition { private static ColumnSchema newColumn(String column , Type type , boolean isPrimary){ final ColumnSchema.ColumnSchemaBuilder columnSchemaBuilder = new ColumnSchema.ColumnSchemaBuilder(column, type); columnSchemaBuilder.key(isPrimary); return columnSchemaBuilder.build(); } public static void main(String[] args) { //master地址 final String master = "hadoop01,hadoop02,hadoop03"; final KuduClient client = new KuduClient.KuduClientBuilder(master).defaultSocketReadTimeoutMs(6000).build(); // 設置表的schema List<ColumnSchema> columns = new LinkedList<ColumnSchema>(); columns.add(newColumn("CompanyId", Type.INT32, true)); columns.add(newColumn("WorkId", Type.INT32, false)); columns.add(newColumn("Name", Type.STRING, false)); columns.add(newColumn("Gender", Type.STRING, false)); columns.add(newColumn("Photo", Type.STRING, false)); Schema schema = new Schema(columns); //創建表時提供的所有選項 final CreateTableOptions options = new CreateTableOptions(); //設置備份數 options.setNumReplicas(1) ; //設置范圍分區的分區規則 List<String> parcols = new LinkedList<String>(); parcols.add("CompanyId") ; //設置按照哪個字段進行range分區 options.setRangePartitionColumns(parcols); /** * 設置range的分區范圍 * 分區1:0 < value < 10 * 分區2:10 <= value < 20 * 分區3:20 <= value < 30 * ........ * 分區9:80 <= value < 90 * */ int count = 0 ; for(int i=1 ; i< 10 ; i++){ PartialRow lower = schema.newPartialRow(); lower.addInt("CompanyId" , count); PartialRow upper = schema.newPartialRow(); count += 10; upper.addInt("CompanyId" , count); options.addRangePartition(lower , upper); } try { client.createTable("rangeTable" , schema , options); } catch (KuduException e) { e.printStackTrace(); }finally { try { client.close(); } catch (KuduException e) { e.printStackTrace(); } } } }
效果截圖:
Hash Partitioning ( 哈希分區 )
哈希分區通過哈希值將行分配到許多 buckets ( 存儲桶 )之一; 當不需要對表進行有序訪問時,哈希分區是一種有效的策略。哈希分區對於在 tablet 之間隨機散布寫入是有效的,這有助於減輕熱點和 tablet 大小不均勻。
創建一張表,要求按照如下方式進行分區:
create table hashTable(CompanyId Type.INT32 , WorkId Type.INT32 , Name Type.STRING , Gender Type.STRING , Photo Type.STRING) HASH (CompanyId) PARTITIONS 6, RANGE (CompanyId) ( PARTITION UNBOUNDED )
代碼實現:
public class createHashPartition { private static ColumnSchema newColumn(String column , Type type , boolean isPrimary){ final ColumnSchema.ColumnSchemaBuilder columnSchemaBuilder = new ColumnSchema.ColumnSchemaBuilder(column, type); columnSchemaBuilder.key(isPrimary); return columnSchemaBuilder.build(); } public static void main(String[] args) { //master地址 final String master = "hadoop01,hadoop02,hadoop03"; final KuduClient client = new KuduClient.KuduClientBuilder(master).defaultSocketReadTimeoutMs(6000).build(); // 設置表的schema List<ColumnSchema> columns = new LinkedList<ColumnSchema>(); columns.add(newColumn("CompanyId", Type.INT32, true)); columns.add(newColumn("WorkId", Type.INT32, false)); columns.add(newColumn("Name", Type.STRING, false)); columns.add(newColumn("Gender", Type.STRING, false)); columns.add(newColumn("Photo", Type.STRING, false)); Schema schema = new Schema(columns); //創建表時提供的所有選項 final CreateTableOptions options = new CreateTableOptions(); //設置備份數 options.setNumReplicas(1) ; //設置范圍分區的分區規則 List<String> parcols = new LinkedList<String>(); parcols.add("CompanyId") ; //設置按照哪個字段進行Hash分區 options.addHashPartitions(parcols , 6); try { client.createTable("hashTable" , schema , options); } catch (KuduException e) { e.printStackTrace(); }finally { try { client.close(); } catch (KuduException e) { e.printStackTrace(); } } } }
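Multilevel Partitioning ( 多級分區 )
Kudu 允許一張表同時組合使用哈希分區和范圍分區。創建一張表,要求按照如下方式進行分區:
create table MultilevelTable(CompanyId Type.INT32 , WorkId Type.INT32 , Name Type.STRING , Gender Type.STRING , Photo Type.STRING) HASH (CompanyId) PARTITIONS 10, RANGE (CompanyId) ( PARTITION 0 <= VALUES < 10, PARTITION 10 <= VALUES < 20, PARTITION 20 <= VALUES < 30, PARTITION 30 <= VALUES < 40, PARTITION 40 <= VALUES < 50, PARTITION 50 <= VALUES < 60, PARTITION 60 <= VALUES < 70, PARTITION 70 <= VALUES < 80, PARTITION 80 <= VALUES < 90 )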
實現:
public class MultilevelParitition { private static ColumnSchema newColumn(String column , Type type , boolean isPrimary){ final ColumnSchema.ColumnSchemaBuilder columnSchemaBuilder = new ColumnSchema.ColumnSchemaBuilder(column, type); columnSchemaBuilder.key(isPrimary); return columnSchemaBuilder.build(); } public static void main(String[] args) { //master地址 final String master = "hadoop01,hadoop02,hadoop03"; final KuduClient client = new KuduClient.KuduClientBuilder(master).defaultSocketReadTimeoutMs(6000).build(); // 設置表的schema List<ColumnSchema> columns = new LinkedList<ColumnSchema>(); columns.add(newColumn("CompanyId", Type.INT32, true)); columns.add(newColumn("WorkId", Type.INT32, false)); columns.add(newColumn("Name", Type.STRING, false)); columns.add(newColumn("Gender", Type.STRING, false)); columns.add(newColumn("Photo", Type.STRING, false)); Schema schema = new Schema(columns); //創建表時提供的所有選項 final CreateTableOptions options = new CreateTableOptions(); //設置備份數 options.setNumReplicas(1) ; //設置范圍分區的分區規則 List<String> parcols = new LinkedList<String>(); parcols.add("CompanyId") ; //設置按照哪個字段進行range分區 options.addHashPartitions(parcols , 10); options.setRangePartitionColumns(parcols); /** * 設置range的分區范圍 * 分區1:0 < value < 10 * 分區2:10 <= value < 20 * 分區3:20 <= value < 30 * ........ * 分區9:80 <= value < 90 * */ int count = 0 ; for(int i=1 ; i< 10 ; i++){ PartialRow lower = schema.newPartialRow(); lower.addInt("CompanyId" , count); PartialRow upper = schema.newPartialRow(); count += 10; upper.addInt("CompanyId" , count); options.addRangePartition(lower , upper); } try { client.createTable("MultilevelTable" , schema , options); } catch (KuduException e) { e.printStackTrace(); }finally { try { client.close(); } catch (KuduException e) { e.printStackTrace(); } } } }
哈希分區有利於最大限度地提高寫入吞吐量,而范圍分區可避免 tablet 無限增長的問題;hash分區和range分區結合,可以極大提升kudu性能