怎么在自己做网站,网站安全建设架构,app网站开发定制,两颗米网站建设将excel文件转成txt文件#xff0c;再将txt文件上传hdfs#xff0c;最后传入hive中
注意的点 #xff08;1#xff09;先判断写入的txt文件是否存在#xff0c;如果不存在就需要创建路径 #xff08;2#xff09;如果txt文件已经存在#xff0c;那么先将对应的文件进行…将excel文件转成txt文件再将txt文件上传hdfs最后传入hive中
注意的点 1先判断写入的txt文件是否存在如果不存在就需要创建路径 2如果txt文件已经存在那么先将对应的文件进行删除后再写入txt数据 3excel文件中有可能第一行是字段名需要跳过
版本1 本地版本 1.利用python将excel转成txt文件
from datetime import datetime, timedelta
import os
import pytz
import pandas as pddef excel_to_txt(name, date):# Read Excel file into a DataFramedf pd.read_excel(fdata/excel/{name}.xlsx, headerNone, skiprows1)# Define output directory and pathoutput_directory os.path.join(data, txt, date)os.makedirs(output_directory, exist_okTrue) # Create directory if it doesnt existoutput_path os.path.join(output_directory, f{name}.txt)# Check if the file already exists, if so, remove itif os.path.exists(output_path):os.remove(output_path)print(fExisting file {output_path} removed.)# Write DataFrame to a new text fileprint(开始写入txt文件)df.to_csv(output_path, headerNone, sep\t, indexFalse)print(文件写入成功!)return output_pathif __name__ __main__:current_time datetime.now(pytz.timezone(Asia/Shanghai))one_day_ago (current_time - timedelta(days1)).strftime(%Y-%m-%d)local_file_path excel_to_txt(IS_GS_Recruitment_Data_20231211, one_day_ago)print(local_file_path)
2.上传到hdfs 3.在hive中创建表
drop table if exists ticket.test_text;
create external table IF NOT EXISTS ticket.test_text
(name string,age int
) comment row format delimited fields terminated by \tlines terminated by \nNULL DEFINED AS stored as textfileLOCATION /warehouse/ticket/ods/test_text;4.将hdfs数据写入hive
load data inpath /origin_data/test.txt overwrite into table ticket.test_text;2服务器版本 先把excel_to_txt脚本上传到服务器
excel_to_txt.py
from datetime import datetime, timedelta
import os
import pytz
import pandas as pddef excel_to_txt(name, date):# Read Excel file into a DataFramedf pd.read_excel(f/opt/module/data/excel/{name}.xlsx, headerNone,skiprows1)# df pd.read_excel(fhdfs://mycluster:8020/origin_data/hr_cn/db/is_gs_recruitment_data_full/excel/{name}.xlsx, headerNone,skiprows1)# df pd.read_excel(fdata/excel/{name}.xlsx, headerNone,skiprows1)# Define output directory and pathoutput_directory os.path.join(/opt/module/data, txt, date)os.makedirs(output_directory, exist_okTrue) # Create directory if it doesnt existoutput_path os.path.join(output_directory, f{name}.txt)# Check if the file already exists, if so, remove itif os.path.exists(output_path):os.remove(output_path)print(fExisting file {output_path} removed.)# Write DataFrame to a new text fileprint(开始写入txt文件)df.to_csv(output_path, headerNone, sep\t, indexFalse)print(文件写入成功!)return output_pathif __name__ __main__:current_time datetime.now(pytz.timezone(Asia/Shanghai))one_day_ago (current_time - timedelta(days1)).strftime(%Y-%m-%d)local_file_path excel_to_txt(IS_GS_Recruitment_Data_20231211, one_day_ago)print(local_file_path)2.安装python3环境安装链接 https://editor.csdn.net/md/?articleId129627849
3.执行python脚本
recruitment_excel_to_txt.sh
#!/bin/bash
/opt/module/miniconda3/bin/python /opt/module/data/excel/excel_to_txt.py4.上传到hdfs并将数据导入hive
recruitment_hdfs_to_ods.sh
#!/bin/bash
DATAX_HOME/opt/module/datax# 如果传入日期则do_date等于传入的日期否则等于前一天日期
if [ -n $2 ] ;thendatestr$2
elsedatestr$(date -d -1 day %F)
fi# 处理目标路径检查目标路径是否存在且不为空如果不为空则清空目录内容
handle_target() {content_size$(hadoop fs -count $1 | awk {print $3})if [[ $content_size -ne 0 ]]; thenecho 路径$1不为空正在清空......hadoop fs -rm -r -f $1/*fi
}# 整合处理目标路径和上传文件的逻辑
handle_target_and_put() {handle_target $2echo 上傳文件hadoop fs -put $1 $2
}function import_data(){
# $*: 获取所有参数,如果使用包裹之后,$*当做整体
# $#: 获取参数个数
# $: 获取所有参数,如果使用包裹之后,把每个参数当做单独的个体
# $?: 获取上一个指令的结果tableNames$*sqluse hr_cn;#遍历所有表,拼接每个表的数据加载sql语句for table in $tableNamesdosql${sql}load data inpath /origin_data/hr_cn/db/${table:4}/${datestr}/* overwrite into table ${table} partition (dt$datestr);done#执行sql/opt/module/hive/bin/hive -e $sql
}case $1 in
all)handle_target_and_put /opt/module/data/txt/${datestr}/ /origin_data/hr_cn/db/recruitment_info_full/import_data ods_recruitment_info_full;;
recruitment_info)handle_target_and_put /opt/module/data/txt/${datestr}/ /origin_data/hr_cn/db/recruitment_info_full/import_data ods_recruitment_info_full;;
esac