123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119 |
<?php

declare(strict_types=1);

namespace app\command;

// NOTE(review): ElasticsearchException comes from the `Elastic\Elasticsearch`
// namespace while ClientBuilder comes from `Elasticsearch` — the two look like
// different major versions of the client. Confirm which one is installed.
use Elastic\Elasticsearch\Exception\ElasticsearchException;
use Elasticsearch\ClientBuilder;
use think\console\Command;
use think\console\Input;
use think\console\input\Argument;
use think\console\input\Option;
use think\console\Output;

// The import decodes whole JSON export files in memory and may run for a long
// time, so the memory and execution-time limits are lifted. These calls run as
// soon as this file is loaded, not only when the command is executed.
ini_set("memory_limit", "-1");
set_time_limit(0);
/**
 * Console command that bulk-loads PubMed JSON export files into Elasticsearch.
 *
 * Files are named with a zero-padded four-digit number (e.g. 0001.json) and
 * live in DATA_DIR. The command takes the first and last file number, decodes
 * each file, maps every record onto the index document layout and sends the
 * documents to Elasticsearch in bulk requests of at most BULK_THRESHOLD items.
 */
class DocumentPubmed extends Command
{
    /** Directory that holds the numbered JSON export files. */
    private const DATA_DIR = '/esdata2/esdatajson/enpubmed/';

    /** Index that receives the documents. */
    private const INDEX_NAME = 'document_pubmed_v20231025';

    /** Maximum number of documents sent in a single bulk request. */
    private const BULK_THRESHOLD = 13000;

    /** Elasticsearch node(s) the client connects to. */
    private const HOSTS = [['host' => '182.43.12.243', 'port' => 9200]];

    /**
     * Registers the command name, its two optional arguments and description.
     */
    protected function configure(): void
    {
        $this->setName('DocumentPubmed')
            ->addArgument('start', Argument::OPTIONAL, 'your first')
            ->addArgument('end', Argument::OPTIONAL, 'your last')
            ->setDescription('the DocumentPubmed command');
    }

    /**
     * Imports every file in the requested range.
     *
     * Unreadable or malformed files are reported and skipped. An exception from
     * the bulk call aborts the run, as further requests are unlikely to succeed.
     *
     * @param Input  $input  Provides the `start` and `end` file numbers, e.g. 0001 1001.
     * @param Output $output Receives progress and error messages.
     *
     * @return int 0 when everything was indexed, 1 when anything failed.
     */
    protected function execute(Input $input, Output $output): int
    {
        $files = $this->resolveFiles($input->getArgument('start'), $input->getArgument('end'));
        if ($files === []) {
            $output->writeln(
                'usage: DocumentPubmed <start> [<end>] - file numbers must be non-negative integers with start <= end'
            );

            return 1;
        }

        // One client serves the whole run; it does not depend on the file being read.
        $client = ClientBuilder::create()
            ->setHosts(self::HOSTS)
            ->build();

        $failed = false;
        foreach ($files as $file) {
            try {
                $records = $this->readRecords($file);
            } catch (\RuntimeException $e) {
                $output->writeln($e->getMessage());
                $failed = true;
                continue;
            }

            // Split the file into batches so a single request stays bounded.
            foreach (array_chunk($records, self::BULK_THRESHOLD) as $batch) {
                $params = $this->buildBulkParams($batch);
                if ($params === []) {
                    // Nothing usable in this batch; an empty bulk body would be rejected.
                    continue;
                }

                try {
                    $result = $client->bulk($params);
                } catch (\Exception $e) {
                    $output->writeln('exception message ' . $e->getMessage());

                    return 1;
                }

                $failures = $this->countBulkFailures($result);
                if ($failures > 0) {
                    $output->writeln(
                        'bulk request reported ' . $failures . ' failed item(s) for ' . $file
                    );
                    $failed = true;
                }
            }
        }

        // Read the first/last entries without mutating the list, so a
        // single-file run reports that file as both start and end.
        $output->writeln('文件开始' . $files[0] . '; 文件结束' . $files[count($files) - 1]);

        return $failed ? 1 : 0;
    }

    /**
     * Turns the `start`/`end` arguments into the list of file paths to import.
     *
     * @param mixed $start First file number; digits only, leading zeros allowed.
     * @param mixed $end   Last file number; defaults to $start when omitted.
     *
     * @return string[] Paths in ascending order, or an empty list when the
     *                  arguments are missing, not numeric, or start > end.
     */
    private function resolveFiles($start, $end): array
    {
        if ($end === null) {
            $end = $start;
        }
        if (!$this->isFileNumber($start) || !$this->isFileNumber($end)) {
            return [];
        }

        $first = (int) $start;
        $last = (int) $end;

        $files = [];
        for ($number = $first; $number <= $last; $number++) {
            // Pad the number to four digits to match the export file names.
            $files[] = self::DATA_DIR . str_pad((string) $number, 4, '0', STR_PAD_LEFT) . '.json';
        }

        return $files;
    }

    /**
     * Tells whether a raw argument is a non-negative whole number.
     *
     * @param mixed $value Raw argument value.
     */
    private function isFileNumber($value): bool
    {
        if (is_int($value)) {
            return $value >= 0;
        }

        return is_string($value) && preg_match('/\A[0-9]+\z/', $value) === 1;
    }

    /**
     * Reads one export file and decodes it into an array of records.
     *
     * @param string $file Path of the JSON file.
     *
     * @return array Decoded top-level JSON array.
     *
     * @throws \RuntimeException When the file cannot be read or does not decode to an array.
     */
    private function readRecords(string $file): array
    {
        if (!is_file($file) || !is_readable($file)) {
            throw new \RuntimeException('cannot read file ' . $file);
        }

        $raw = file_get_contents($file);
        if ($raw === false) {
            throw new \RuntimeException('cannot read file ' . $file);
        }

        $decoded = json_decode($raw, true);
        if (!is_array($decoded)) {
            throw new \RuntimeException(
                'file ' . $file . ' does not contain a JSON array (' . json_last_error_msg() . ')'
            );
        }

        return $decoded;
    }

    /**
     * Builds the bulk request parameters for one batch of records.
     *
     * Each record contributes an `index` action line followed by its document.
     * Entries that are not arrays are skipped.
     *
     * @param array $batch Records taken from one export file.
     *
     * @return array `['body' => [...]]`, or an empty array when the batch has no usable record.
     */
    private function buildBulkParams(array $batch): array
    {
        $body = [];
        foreach ($batch as $record) {
            if (!is_array($record)) {
                continue;
            }

            $doi = $record['doi'] ?? '';
            $body[] = [
                'index' => [
                    '_index' => self::INDEX_NAME,
                    // NOTE(review): mapping types are deprecated in Elasticsearch 7 and
                    // removed in 8; kept because the cluster version is not visible here.
                    '_type' => '_doc',
                    '_id' => $this->generate_unique_id(is_scalar($doi) ? (string) $doi : ''),
                ],
            ];
            $body[] = $this->buildDocument($record);
        }

        return $body === [] ? [] : ['body' => $body];
    }

    /**
     * Maps one export record onto the index document layout.
     *
     * Fields with no source in the export are written as empty placeholders.
     *
     * @param array $record One decoded record. Presumably `author` is a
     *                      comma-separated string and `keyword` a list of
     *                      strings — verify against the export format.
     *
     * @return array Document ready to be placed in a bulk body.
     */
    private function buildDocument(array $record): array
    {
        $author = $record['author'] ?? '';
        if (is_array($author)) {
            $authorList = $author;
        } elseif (is_string($author) && $author !== '') {
            $authorList = explode(',', $author);
        } else {
            // A missing or empty author means no authors, not one empty name.
            $authorList = [];
        }

        $keyword = $record['keyword'] ?? [];
        if (is_array($keyword)) {
            $keywordList = $keyword;
            $keywordText = implode(',', $keyword);
        } elseif (is_string($keyword) && $keyword !== '') {
            $keywordList = [$keyword];
            $keywordText = $keyword;
        } else {
            $keywordList = [];
            $keywordText = '';
        }

        return [
            'abstract' => $record['abstract'] ?? '',
            'album' => $record['journal_title'] ?? '',
            'author' => $author,
            'author_count' => count($authorList),
            'author_list' => $authorList,
            'author_org' => [],
            'author_org_list' => [],
            'doi' => $record['doi'] ?? '',
            'keyword' => $keywordText,
            'keyword_list' => $keywordList,
            'mark' => '',
            // NOTE(review): this key looks like a typo, but renaming it would
            // change the stored field name — confirm against the index mapping.
            'maorganizationrk' => '',
            'organization_count' => 0,
            'organization_parsed' => [],
            'page_content' => '',
            'pdf_name' => '',
            'pdf_url' => '',
            'reference_related_count' => 0,
            'references' => $record['reference'] ?? [],
            'subject' => '',
            'subject_list' => '',
            'title' => $record['title'] ?? '',
            'uniq_id' => '',
            'url' => '',
            'year' => $record['year'] ?? '',
        ];
    }

    /**
     * Counts the items a bulk response reports as failed.
     *
     * @param mixed $result Value returned by the client's bulk() call.
     *
     * @return int 0 when the response does not flag errors; otherwise the number
     *             of `index` items carrying an `error`, and at least 1.
     */
    private function countBulkFailures($result): int
    {
        if (!is_array($result) && !($result instanceof \ArrayAccess)) {
            return 0;
        }
        if (empty($result['errors'])) {
            return 0;
        }

        $failures = 0;
        $items = $result['items'] ?? [];
        if (is_array($items)) {
            foreach ($items as $item) {
                if (isset($item['index']['error'])) {
                    $failures++;
                }
            }
        }

        return max($failures, 1);
    }

    /**
     * Generates a 32-character hexadecimal identifier.
     *
     * The id is the first 32 characters of a SHA-256 hash over the current
     * microtime, a random number and a uniqid() seeded with $prefix.
     *
     * NOTE(review): because time and randomness are mixed in, the same prefix
     * yields a different id on every call, so importing a file twice creates
     * new documents instead of overwriting the earlier ones.
     *
     * @param string $prefix Prefix handed to uniqid(); the command passes the record's DOI.
     *
     * @return string 32 lowercase hexadecimal characters.
     */
    public function generate_unique_id(string $prefix = ''): string
    {
        $microtime = microtime(true);
        $rand_num = mt_rand();
        $unique_id = uniqid($prefix, false);
        $hash = hash('sha256', $microtime . $rand_num . $unique_id);

        return substr($hash, 0, 32);
    }
}
|