Hive 是一个基于 Hadoop 的数据仓库工具,它可以将结构化的数据文件映射为数据库表,并提供类 SQL(HiveQL)的查询功能。下面是使用 Hive 进行数据清洗的常用步骤。
-- Template: define a table whose rows are stored as delimited fields.
-- Replace the table name, column list, delimiter and file format placeholders.
CREATE TABLE your_table_name (
    column1 data_type,
    column2 data_type,
    ...
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY 'delimiter'
STORED AS file_format;
例如:
-- Example: a users table stored as comma-separated text files.
CREATE TABLE users (
    id   INT,
    name STRING,
    age  INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
-- Template: load the data found at a path into a table.
-- The square brackets mark LOCAL as an optional keyword.
LOAD DATA [LOCAL] INPATH 'path/to/your/data'
INTO TABLE your_table_name;
例如:
-- Example: load a staged file into the users table.
-- The source must be a staging location outside the table's own storage
-- directory: LOAD DATA INPATH moves the source files into the table, and
-- /user/hive/warehouse/users is where a managed table named users is stored
-- under the default warehouse setting.
LOAD DATA INPATH '/user/hive/staging/users.csv'
INTO TABLE users;
-- Template: keep only the rows that satisfy a condition.
SELECT *
FROM your_table_name
WHERE condition;
例如:
-- Example: keep users aged 18 through 60, both bounds included.
SELECT *
FROM users
WHERE age BETWEEN 18 AND 60;
-- Template: drop the rows in which a given column is NULL.
SELECT *
FROM your_table_name
WHERE column_name IS NOT NULL;
例如:
-- Example: keep only the users whose name is present.
SELECT *
FROM users
WHERE name IS NOT NULL;
-- Template: substitute a default value wherever column2 is NULL.
SELECT
    column1,
    COALESCE(column2, 'default_value') AS column2
FROM your_table_name;
例如:
-- Example: report 'Unknown' for users whose name is NULL.
SELECT
    id,
    COALESCE(name, 'Unknown') AS name
FROM users;
-- Template: drop the rows in which a given column is NULL.
-- NOTE(review): this repeats the NOT NULL filter template that appears earlier
-- in this file; confirm whether a different cleaning step was intended here.
SELECT *
FROM your_table_name
WHERE column_name IS NOT NULL;
例如:
-- Example: keep only the users whose name is present.
-- NOTE(review): this repeats the NOT NULL example that appears earlier in this
-- file; confirm whether a different cleaning step was intended here.
SELECT *
FROM users
WHERE name IS NOT NULL;
-- Template: rename columns in the result set with AS aliases.
SELECT
    column1 AS new_column1,
    column2 AS new_column2
FROM your_table_name;
例如:
-- Example: expose id and name as user_id and user_name.
SELECT
    id   AS user_id,
    name AS user_name
FROM users;
-- Template: count the rows for each distinct value of column1.
SELECT
    column1,
    COUNT(*) AS count
FROM your_table_name
GROUP BY column1;
例如:
-- Example: number of users for each age.
SELECT
    age,
    COUNT(*) AS user_count
FROM users
GROUP BY age;
-- Template: sort the rows by a column.
-- [ASC|DESC] is syntax notation: choose one of the two directions.
SELECT *
FROM your_table_name
ORDER BY column_name [ASC|DESC];
例如:
-- Example: list users from youngest to oldest.
SELECT *
FROM users
ORDER BY age ASC;
-- Template: replace the contents of one table with the rows of another.
INSERT OVERWRITE TABLE new_table_name
SELECT *
FROM your_table_name;
例如:
-- Example: replace the contents of cleaned_users with every row of users.
INSERT OVERWRITE TABLE cleaned_users
SELECT *
FROM users;
或者将结果保存到文件:
-- Template: export query results to a directory.
-- Hive has no SELECT ... INTO OUTFILE (that is MySQL syntax); results are
-- exported with INSERT OVERWRITE [LOCAL] DIRECTORY, which replaces the
-- contents of the target directory. Add LOCAL to write to the local
-- filesystem instead of HDFS.
INSERT OVERWRITE [LOCAL] DIRECTORY 'path/to/output/directory'
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY 'delimiter'
SELECT *
FROM your_table_name;
例如:
-- Example: export every row of users as comma-separated files on HDFS.
-- Hive has no SELECT ... INTO OUTFILE (that is MySQL syntax); the Hive form
-- is INSERT OVERWRITE DIRECTORY, which replaces the contents of the target
-- directory. The target is therefore a dedicated export directory rather than
-- /user/hive/warehouse/cleaned_users, which under the default warehouse
-- setting is where a managed table named cleaned_users keeps its data.
INSERT OVERWRITE DIRECTORY '/user/hive/output/cleaned_users'
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
SELECT *
FROM users;
通过以上步骤,您可以在 Hive 中完成基本的数据清洗操作。请注意,Hive 并不支持全部 SQL 功能,因此某些操作可能需要借助其他工具(如 MapReduce 或 Spark)来完成。