DocumentPubmed.php 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119
  1. <?php
  2. declare (strict_types = 1);
  3. namespace app\command;
  4. ini_set("memory_limit", "-1");
  5. set_time_limit(0);
  6. use Elastic\Elasticsearch\Exception\ElasticsearchException;
  7. use think\console\Command;
  8. use think\console\Input;
  9. use think\console\input\Argument;
  10. use think\console\input\Option;
  11. use think\console\Output;
  12. use Elasticsearch\ClientBuilder;
  13. class DocumentPubmed extends Command
  14. {
  15. protected function configure()
  16. {
  17. // 指令配置
  18. $this->setName('DocumentPubmed')
  19. ->addArgument('start',Argument::OPTIONAL,'your first')
  20. ->addArgument('end',Argument::OPTIONAL,'your last')
  21. ->setDescription('the DocumentPubmed command');
  22. }
  23. protected function execute(Input $input, Output $output)
  24. {
  25. // 接收指令输出 0001 1001
  26. $start_file = $input->getArgument('start');
  27. $end_file = $input->getArgument('end');
  28. //$startfilePath = './public/pubmed/'.$start_file.'.json'; // 替换为实际的JSON文件路径
  29. for ($i = $start_file; $i <= $end_file; $i++) {
  30. $file = '/esdata2/esdatajson/enpubmed/'.str_pad((string)$i, 4, '0', STR_PAD_LEFT).'.json';
  31. //文件数组
  32. $numbers[] = $file; // 使用 str_pad 函数填充数字为 4 位数
  33. }
  34. // 遍历每个文件
  35. // 批量写入的阈值
  36. $bulkThreshold = 13000;
  37. foreach ($numbers as $file) {
  38. // 读取json文件
  39. $jsonContent = file_get_contents($file);
  40. $jsonData = json_decode($jsonContent, true);
  41. // 分割数据为较小的批次
  42. $batches = array_chunk($jsonData, $bulkThreshold);
  43. // 创建连接
  44. $hosts = [['host' => '182.43.12.243','port' => 9200, ],];
  45. $client = ClientBuilder::create()
  46. ->setHosts($hosts)
  47. ->build();
  48. // 遍历每个批次
  49. foreach ($batches as $key => $batch) {
  50. $params = [];
  51. $newContent = [];
  52. foreach ($batch as $contents) {
  53. $newContent['abstract'] = $contents['abstract'] ?? '';
  54. $newContent['album'] = $contents['journal_title'] ?? '';
  55. $newContent['author'] = $contents['author'] ?? '';
  56. $newContent['author_count'] = count(explode(',', $contents['author'])) ?? '';
  57. $newContent['author_list'] = explode(',', $contents['author']) ?? [];
  58. $newContent['author_org'] = [];
  59. $newContent['author_org_list'] = [];
  60. $newContent['doi'] = $contents['doi'] ?? '';
  61. $newContent['keyword'] = implode(',', $contents['keyword']) ?? '';
  62. $newContent['keyword_list'] = $contents['keyword'] ?? '';
  63. $newContent['mark'] = '';
  64. $newContent['maorganizationrk'] = '';
  65. $newContent['organization_count'] = 0;
  66. $newContent['organization_parsed'] = [];
  67. $newContent['page_content'] = '';
  68. $newContent['pdf_name'] = '';
  69. $newContent['pdf_url'] = '';
  70. $newContent['reference_related_count'] = 0;
  71. $newContent['references'] = $contents['reference'] ?? [];
  72. $newContent['subject'] = '';
  73. $newContent['subject_list'] = '';
  74. $newContent['title'] = $contents['title'] ?? '';
  75. $newContent['uniq_id'] = '';
  76. $newContent['url'] = '';
  77. $newContent['year'] = $contents['year'] ?? '';
  78. $params['body'][]=array(
  79. 'index' => array(
  80. '_index' => 'document_pubmed_v20231025',
  81. '_type' => '_doc',
  82. '_id' => $this->generate_unique_id($contents['doi']??''),
  83. ),
  84. );
  85. $params['body'][]=$newContent;
  86. }
  87. // 处理响应结果
  88. try {
  89. $result = $client->bulk($params);
  90. } catch (\Exception $e) {
  91. die( 'exception message ' . $e->getMessage() . PHP_EOL);
  92. }
  93. }
  94. }
  95. // 指令输出
  96. $output->writeln('文件开始'.array_shift($numbers).'; 文件结束'.array_pop($numbers));
  97. }
  98. //生成uuid
  99. function generate_unique_id($prefix = '') {
  100. $microtime = microtime(true);
  101. $rand_num = mt_rand();
  102. $unique_id = uniqid($prefix, false);
  103. $hash = hash('sha256', $microtime . $rand_num . $unique_id);
  104. return substr($hash, 0, 32);
  105. }
  106. }