jiankun
/
GY_zhishiku


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119
							<?php
declare (strict_types = 1);
namespace app\command;
// Remove the PHP memory limit: the command in this file decodes whole JSON
// dump files into memory at once.
// NOTE(review): these two calls are file-level side effects, so they run
// whenever this file is loaded, not only when the command is executed.
ini_set("memory_limit", "-1");
// Disable the execution time limit so a long-running import is not aborted.
set_time_limit(0);

use Elastic\Elasticsearch\Exception\ElasticsearchException;
use think\console\Command;
use think\console\Input;
use think\console\input\Argument;
use think\console\input\Option;
use think\console\Output;
use Elasticsearch\ClientBuilder;

/**
 * Console command that bulk-imports PubMed JSON dump files into Elasticsearch.
 *
 * Usage: `php think DocumentPubmed <start> <end>`, e.g. `DocumentPubmed 0001 1001`.
 * Every number in the inclusive range is zero-padded to four digits and mapped
 * to `<SOURCE_DIR><number>.json`. Each file is decoded, split into batches of
 * at most BULK_THRESHOLD records and sent through the bulk API.
 */
class DocumentPubmed extends Command
{
    /** Directory containing the source dumps, named as zero-padded numbers (e.g. 0001.json). */
    private const SOURCE_DIR = '/esdata2/esdatajson/enpubmed/';

    /** Target Elasticsearch index. */
    private const INDEX_NAME = 'document_pubmed_v20231025';

    /** Elasticsearch nodes to connect to. */
    private const ES_HOSTS = [['host' => '182.43.12.243', 'port' => 9200]];

    /** Maximum number of records sent in a single bulk request. */
    private const BULK_THRESHOLD = 13000;

    /**
     * Registers the command name, its two positional arguments and its description.
     */
    protected function configure()
    {
        // Command configuration.
        $this->setName('DocumentPubmed')
            ->addArgument('start', Argument::OPTIONAL, 'your first')
            ->addArgument('end', Argument::OPTIONAL, 'your last')
            ->setDescription('the DocumentPubmed command');
    }

    /**
     * Imports every dump file in the requested range.
     *
     * Unreadable or invalid files are reported and skipped. An exception thrown
     * by a bulk request stops the import immediately.
     *
     * @param Input  $input  provides the `start` and `end` file numbers
     * @param Output $output receives progress and error messages
     *
     * @return int 0 when everything was imported, 1 when arguments were invalid,
     *             a file was skipped, or Elasticsearch reported a failure
     */
    protected function execute(Input $input, Output $output)
    {
        $start = $input->getArgument('start');
        $end = $input->getArgument('end');

        // Both arguments are declared optional, so they may be null here; reject
        // anything that is not a plain non-negative integer before looping.
        if (!$this->isFileNumber($start) || !$this->isFileNumber($end)) {
            $output->writeln('start and end must be non-negative integers, e.g. 0001 1001');

            return 1;
        }

        $first = (int) $start;
        $last = (int) $end;
        if ($first > $last) {
            $output->writeln('start must not be greater than end');

            return 1;
        }

        // Build the list of dump files; numbers are left-padded to four digits.
        $files = [];
        for ($number = $first; $number <= $last; $number++) {
            $files[] = self::SOURCE_DIR . str_pad((string) $number, 4, '0', STR_PAD_LEFT) . '.json';
        }

        // The client does not depend on the file being processed, so build it once.
        $client = ClientBuilder::create()
            ->setHosts(self::ES_HOSTS)
            ->build();

        $failed = false;
        foreach ($files as $file) {
            $records = $this->readRecords($file, $output);
            if ($records === null) {
                $failed = true;
                continue;
            }

            // Split the file into batches small enough for one bulk request.
            foreach (array_chunk($records, self::BULK_THRESHOLD) as $batchNo => $batch) {
                $params = ['body' => []];
                foreach ($batch as $record) {
                    if (!is_array($record)) {
                        // Not a record object; nothing to map.
                        continue;
                    }

                    $doi = isset($record['doi']) && is_scalar($record['doi']) ? (string) $record['doi'] : '';

                    // Bulk bodies alternate an action line and the document itself.
                    $params['body'][] = [
                        'index' => [
                            '_index' => self::INDEX_NAME,
                            // NOTE(review): mapping types are deprecated in Elasticsearch 7
                            // and removed in 8 - drop `_type` when the cluster is upgraded.
                            '_type' => '_doc',
                            '_id' => $this->generate_unique_id($doi),
                        ],
                    ];
                    $params['body'][] = $this->buildDocument($record);
                }

                if ($params['body'] === []) {
                    // An empty bulk body is rejected by Elasticsearch.
                    continue;
                }

                try {
                    $result = $client->bulk($params);
                } catch (\Exception $e) {
                    $output->writeln('exception message ' . $e->getMessage());

                    return 1;
                }

                // A bulk request can succeed as a whole while individual items fail.
                if (!empty($result['errors'])) {
                    $failed = true;
                    $output->writeln('bulk request reported item errors: ' . $file . ' batch ' . $batchNo);
                }
            }
        }

        // Command output: first and last file of the requested range.
        $output->writeln('文件开始' . $files[0] . '; 文件结束' . $files[count($files) - 1]);

        return $failed ? 1 : 0;
    }

    /**
     * Generates a 32-character hexadecimal document id.
     *
     * The id is the first 32 characters of a SHA-256 hash over the current
     * microtime, a random number and `uniqid($prefix)`. It is therefore different
     * on every call, even for the same prefix.
     *
     * @param string $prefix value passed to uniqid() as prefix (the record DOI in this command)
     *
     * @return string 32 lowercase hexadecimal characters
     */
    public function generate_unique_id($prefix = '')
    {
        $microtime = microtime(true);
        $rand_num = mt_rand();
        $unique_id = uniqid($prefix, false);
        $hash = hash('sha256', $microtime . $rand_num . $unique_id);

        return substr($hash, 0, 32);
    }

    /**
     * Tells whether a command argument is a string made only of digits.
     *
     * @param mixed $value raw argument value (null when the argument was omitted)
     */
    private function isFileNumber($value): bool
    {
        return is_string($value) && preg_match('/\A\d+\z/', $value) === 1;
    }

    /**
     * Reads and decodes one dump file.
     *
     * @return array|null decoded records, or null when the file is unreadable or
     *                    does not contain a JSON array/object (already reported)
     */
    private function readRecords(string $file, Output $output): ?array
    {
        if (!is_file($file) || !is_readable($file)) {
            $output->writeln('skip unreadable file ' . $file);

            return null;
        }

        $json = file_get_contents($file);
        if ($json === false) {
            $output->writeln('skip unreadable file ' . $file);

            return null;
        }

        $records = json_decode($json, true);
        if (!is_array($records)) {
            $output->writeln('skip invalid JSON file ' . $file . ': ' . json_last_error_msg());

            return null;
        }

        return $records;
    }

    /**
     * Maps one source record onto the document layout of the target index.
     *
     * Fields the source does not provide are filled with empty defaults.
     *
     * @param array $record one decoded record from a dump file
     *
     * @return array document ready to be placed in a bulk body
     */
    private function buildDocument(array $record): array
    {
        $author = $record['author'] ?? null;
        $keyword = $record['keyword'] ?? null;
        $authorList = $this->toList($author);

        return [
            'abstract' => $record['abstract'] ?? '',
            'album' => $record['journal_title'] ?? '',
            'author' => $this->toCsv($author),
            'author_count' => count($authorList),
            'author_list' => $authorList,
            'author_org' => [],
            'author_org_list' => [],
            'doi' => $record['doi'] ?? '',
            'keyword' => $this->toCsv($keyword),
            'keyword_list' => $this->toList($keyword),
            'mark' => '',
            // NOTE(review): this key looks like a typo (possibly "organization");
            // kept unchanged because it is what the command writes today - confirm
            // against the index mapping before renaming.
            'maorganizationrk' => '',
            'organization_count' => 0,
            'organization_parsed' => [],
            'page_content' => '',
            'pdf_name' => '',
            'pdf_url' => '',
            'reference_related_count' => 0,
            'references' => $record['reference'] ?? [],
            'subject' => '',
            'subject_list' => '',
            'title' => $record['title'] ?? '',
            'uniq_id' => '',
            'url' => '',
            'year' => $record['year'] ?? '',
        ];
    }

    /**
     * Normalises a field to a list: arrays are returned as-is, non-empty scalars
     * are split on commas, anything else (missing, null, empty string) gives [].
     *
     * @param mixed $value raw field value
     */
    private function toList($value): array
    {
        if (is_array($value)) {
            return $value;
        }

        if (is_scalar($value) && (string) $value !== '') {
            return explode(',', (string) $value);
        }

        return [];
    }

    /**
     * Normalises a field to a comma-separated string: arrays are joined with
     * commas, scalars are cast to string, anything else gives ''.
     *
     * @param mixed $value raw field value
     */
    private function toCsv($value): string
    {
        if (is_array($value)) {
            return implode(',', $value);
        }

        return is_scalar($value) ? (string) $value : '';
    }
}