setName('DocumentPubmed') ->addArgument('start',Argument::OPTIONAL,'your first') ->addArgument('end',Argument::OPTIONAL,'your last') ->setDescription('the DocumentPubmed command'); }

    /**
     * Bulk-index a numbered range of PubMed JSON dump files into Elasticsearch.
     *
     * Reads the `start` and `end` arguments (e.g. `0001 1001`), maps every
     * number in that inclusive range to
     * `/esdata2/esdatajson/enpubmed/NNNN.json` (zero-padded to four digits),
     * decodes each file and sends its records to the
     * `document_pubmed_v20231025` index in batches through the bulk API.
     *
     * Files that cannot be read or do not decode to a JSON array are reported
     * and skipped. A bulk request that throws aborts the run.
     *
     * @param Input  $input  Console input carrying the `start` / `end` arguments.
     * @param Output $output Console output used for progress and error lines.
     *
     * @return int 0 on success, 1 when the arguments are invalid, the range is
     *             empty, or a bulk request throws.
     */
    protected function execute(Input $input, Output $output)
    {
        $startArg = $input->getArgument('start');
        $endArg = $input->getArgument('end');
        $hasStart = $startArg !== null && $startArg !== '';
        $hasEnd = $endArg !== null && $endArg !== '';

        // Reject anything that is not a plain run of digits; a non-numeric
        // string would otherwise be "incremented" alphabetically by the loop.
        if (($hasStart && !ctype_digit((string) $startArg)) || ($hasEnd && !ctype_digit((string) $endArg))) {
            $output->writeln('arguments start and end must be non-negative integers');

            return 1;
        }

        // An omitted start keeps the previous behaviour of beginning at file
        // 0000; an omitted end means "process only the start file".
        $first = $hasStart ? (int) $startArg : 0;
        $last = $hasEnd ? (int) $endArg : $first;

        // Build the list of dump files, zero-padding each number to 4 digits.
        $files = [];
        for ($number = $first; $number <= $last; $number++) {
            $files[] = '/esdata2/esdatajson/enpubmed/' . str_pad((string) $number, 4, '0', STR_PAD_LEFT) . '.json';
        }

        if ($files === []) {
            $output->writeln('nothing to import: end is smaller than start');

            return 1;
        }

        // Maximum number of records sent in a single bulk request.
        $bulkThreshold = 13000;

        // The connection does not depend on the file, so create it once.
        $hosts = [['host' => '182.43.12.243', 'port' => 9200]];
        $client = ClientBuilder::create()
            ->setHosts($hosts)
            ->build();

        foreach ($files as $file) {
            $records = $this->readPubmedRecords($file);
            if ($records === null) {
                $output->writeln('skipping unreadable or invalid JSON file: ' . $file);
                continue;
            }

            // Split the file's records into batches of at most $bulkThreshold.
            foreach (array_chunk($records, $bulkThreshold) as $batch) {
                $params = ['body' => []];

                foreach ($batch as $contents) {
                    if (!is_array($contents)) {
                        // A record that is not a JSON object cannot be mapped.
                        continue;
                    }

                    // Bulk format: one action line followed by one source line.
                    // NOTE(review): the id is random rather than derived from
                    // the DOI alone, so re-importing a file presumably creates
                    // new documents instead of overwriting - confirm intent.
                    $params['body'][] = [
                        'index' => [
                            '_index' => 'document_pubmed_v20231025',
                            '_type' => '_doc',
                            '_id' => $this->generate_unique_id($contents['doi'] ?? ''),
                        ],
                    ];
                    $params['body'][] = $this->buildPubmedDocument($contents);
                }

                if ($params['body'] === []) {
                    continue;
                }

                try {
                    $result = $client->bulk($params);
                } catch (\Exception $e) {
                    // Report through the console and return a non-zero status;
                    // die() would end the process with exit status 0.
                    $output->writeln('exception message ' . $e->getMessage());

                    return 1;
                }

                // The bulk API can succeed as a request while individual
                // items fail; surface that instead of ignoring the response.
                if (!empty($result['errors'])) {
                    $output->writeln('bulk request reported item-level errors for file: ' . $file);
                }
            }
        }

        // Summary line: first and last file of the requested range. Indexing
        // is used (not array_shift/array_pop) so a one-file range prints the
        // same path for both ends.
        $output->writeln('文件开始' . $files[0] . '; 文件结束' . $files[count($files) - 1]);

        return 0;
    }

    /**
     * Read one dump file and decode it into a list of records.
     *
     * @param string $file Absolute path of the JSON file.
     *
     * @return array|null The decoded top-level array, or null when the file is
     *                    missing, unreadable, or does not contain a JSON array.
     */
    private function readPubmedRecords($file)
    {
        if (!is_file($file) || !is_readable($file)) {
            return null;
        }

        $json = file_get_contents($file);
        if ($json === false) {
            return null;
        }

        $decoded = json_decode($json, true);

        return is_array($decoded) ? $decoded : null;
    }

    /**
     * Map one decoded PubMed record onto the document shape that is indexed.
     *
     * Fields absent from the record fall back to empty values. `author` is
     * treated as a comma-separated string and `keyword` as an array, matching
     * how the record is split and joined here.
     *
     * @param array $contents One decoded record from the dump file.
     *
     * @return array The document body for the bulk request.
     */
    private function buildPubmedDocument(array $contents)
    {
        $author = $contents['author'] ?? '';
        // An absent or empty author yields an empty list (count 0) rather than
        // a list holding one empty string.
        $authorList = (is_string($author) && $author !== '') ? explode(',', $author) : [];

        $keywords = $contents['keyword'] ?? '';

        return [
            'abstract' => $contents['abstract'] ?? '',
            'album' => $contents['journal_title'] ?? '',
            'author' => $author,
            'author_count' => count($authorList),
            'author_list' => $authorList,
            'author_org' => [],
            'author_org_list' => [],
            'doi' => $contents['doi'] ?? '',
            'keyword' => is_array($keywords) ? implode(',', $keywords) : '',
            'keyword_list' => $keywords,
            'mark' => '',
            // NOTE(review): this key looks like a paste slip (perhaps
            // "organization"); kept unchanged because it is part of the
            // indexed document - confirm against the index mapping.
            'maorganizationrk' => '',
            'organization_count' => 0,
            'organization_parsed' => [],
            'page_content' => '',
            'pdf_name' => '',
            'pdf_url' => '',
            'reference_related_count' => 0,
            'references' => $contents['reference'] ?? [],
            'subject' => '',
            'subject_list' => '',
            'title' => $contents['title'] ?? '',
            'uniq_id' => '',
            'url' => '',
            'year' => $contents['year'] ?? '',
        ];
    }

    /**
     * Generate a 32-character hexadecimal identifier.
     *
     * Hashes the current microtime, an mt_rand() value and a uniqid() built
     * from $prefix with SHA-256 and keeps the first 32 hex characters. Because
     * time and random input are mixed in, the same prefix gives a different
     * id on each call.
     *
     * @param string $prefix Prefix passed to uniqid(); defaults to ''.
     *
     * @return string 32 lowercase hexadecimal characters.
     */
    public function generate_unique_id($prefix = '')
    {
        $microtime = microtime(true);
        $rand_num = mt_rand();
        $unique_id = uniqid($prefix, false);
        $hash = hash('sha256', $microtime . $rand_num . $unique_id);

        return substr($hash, 0, 32);
    }
}