fix new hil contract
Showing 1 changed file with 78 additions and 3 deletions
... | @@ -753,71 +753,146 @@ class Finder: | ... | @@ -753,71 +753,146 @@ class Finder: |
753 | if re.match('保证人3', text) is not None: | 753 | if re.match('保证人3', text) is not None: |
754 | anchor = [bbox[0], bbox[1]] | 754 | anchor = [bbox[0], bbox[1]] |
755 | 755 | ||
756 | need_bbox_find_keys_bbox = [None, None, None] | ||
756 | if anchor is not None: | 757 | if anchor is not None: |
757 | for block in self.pdf_info[page_num]['blocks']: | 758 | for block in self.pdf_info[page_num]['blocks']: |
759 | if all(need_bbox_find_keys_bbox): | ||
760 | break | ||
758 | if block['type'] != 0: | 761 | if block['type'] != 0: |
759 | continue | 762 | continue |
760 | for line in block['lines']: | 763 | for line in block['lines']: |
764 | if all(need_bbox_find_keys_bbox): | ||
765 | break | ||
761 | for span in line['spans']: | 766 | for span in line['spans']: |
767 | if all(need_bbox_find_keys_bbox): | ||
768 | break | ||
769 | |||
762 | bbox, text = span['bbox'], span['text'] | 770 | bbox, text = span['bbox'], span['text'] |
763 | # 找到角色姓名 | 771 | # 找到角色姓名 |
764 | if re.match(role_key, text) is not None: | 772 | if re.match(role_key, text) is not None: |
765 | words = text.split(':')[-1] | 773 | words = text.split(':')[-1] |
774 | if len(words) == 0: | ||
775 | need_bbox_find_keys_bbox[0] = bbox | ||
776 | else: | ||
766 | name['words'] = words | 777 | name['words'] = words |
767 | name['page'] = page_num | 778 | name['page'] = page_num |
768 | name['position'] = bbox | 779 | name['position'] = bbox |
780 | continue | ||
769 | if role_key == '承租人:': | 781 | if role_key == '承租人:': |
770 | # 找到证件号码且确定位置 | 782 | # 找到证件号码且确定位置 |
771 | if re.match('证件号码:', text) is not None and np.mean(bbox[::2]) < anchor[0] and np.mean(bbox[1::2]) < anchor[1]: | 783 | if re.match('证件号码:', text) is not None and np.mean(bbox[::2]) < anchor[0] and np.mean(bbox[1::2]) < anchor[1]: |
772 | words = text.split(':')[-1] | 784 | words = text.split(':')[-1] |
785 | if len(words) == 0: | ||
786 | need_bbox_find_keys_bbox[1] = bbox | ||
787 | else: | ||
773 | id_num['words'] = words | 788 | id_num['words'] = words |
774 | id_num['page'] = page_num | 789 | id_num['page'] = page_num |
775 | id_num['position'] = bbox | 790 | id_num['position'] = bbox |
776 | # 找到法人代表且确定位置 | 791 | # 找到法人代表且确定位置 |
777 | if re.match('法定代表人或授权代表:', text) is not None and np.mean(bbox[::2]) < anchor[0] and np.mean(bbox[1::2]) < anchor[1]: | 792 | if re.match('法定代表人或授权代表:', text) is not None and np.mean(bbox[::2]) < anchor[0] and np.mean(bbox[1::2]) < anchor[1]: |
778 | words = text.split(':')[-1] | 793 | words = text.split(':')[-1] |
794 | if len(words) == 0: | ||
795 | need_bbox_find_keys_bbox[2] = bbox | ||
796 | else: | ||
779 | representative['words'] = words | 797 | representative['words'] = words |
780 | representative['page'] = page_num | 798 | representative['page'] = page_num |
781 | representative['position'] = bbox | 799 | representative['position'] = bbox |
782 | if role_key == '保证人1:': | 800 | elif role_key == '保证人1:': |
783 | # 找到证件号码且确定位置 | 801 | # 找到证件号码且确定位置 |
784 | if re.match('证件号码:', text) is not None and np.mean(bbox[::2]) < anchor[0] and np.mean(bbox[1::2]) > anchor[1]: | 802 | if re.match('证件号码:', text) is not None and np.mean(bbox[::2]) < anchor[0] and np.mean(bbox[1::2]) > anchor[1]: |
785 | words = text.split(':')[-1] | 803 | words = text.split(':')[-1] |
804 | if len(words) == 0: | ||
805 | need_bbox_find_keys_bbox[1] = bbox | ||
806 | else: | ||
786 | id_num['words'] = words | 807 | id_num['words'] = words |
787 | id_num['page'] = page_num | 808 | id_num['page'] = page_num |
788 | id_num['position'] = bbox | 809 | id_num['position'] = bbox |
789 | # 找到法人代表且确定位置 | 810 | # 找到法人代表且确定位置 |
790 | if re.match('法定代表人或授权代表:', text) is not None and np.mean(bbox[::2]) < anchor[0] and np.mean(bbox[1::2]) > anchor[1]: | 811 | if re.match('法定代表人或授权代表:', text) is not None and np.mean(bbox[::2]) < anchor[0] and np.mean(bbox[1::2]) > anchor[1]: |
791 | words = text.split(':')[-1] | 812 | words = text.split(':')[-1] |
813 | if len(words) == 0: | ||
814 | need_bbox_find_keys_bbox[2] = bbox | ||
815 | else: | ||
792 | representative['words'] = words | 816 | representative['words'] = words |
793 | representative['page'] = page_num | 817 | representative['page'] = page_num |
794 | representative['position'] = bbox | 818 | representative['position'] = bbox |
795 | if role_key == '保证人2:': | 819 | elif role_key == '保证人2:': |
796 | # 找到证件号码且确定位置 | 820 | # 找到证件号码且确定位置 |
797 | if re.match('证件号码:', text) is not None and np.mean(bbox[::2]) > anchor[0] and np.mean(bbox[1::2]) < anchor[1]: | 821 | if re.match('证件号码:', text) is not None and np.mean(bbox[::2]) > anchor[0] and np.mean(bbox[1::2]) < anchor[1]: |
798 | words = text.split(':')[-1] | 822 | words = text.split(':')[-1] |
823 | if len(words) == 0: | ||
824 | need_bbox_find_keys_bbox[1] = bbox | ||
825 | else: | ||
799 | id_num['words'] = words | 826 | id_num['words'] = words |
800 | id_num['page'] = page_num | 827 | id_num['page'] = page_num |
801 | id_num['position'] = bbox | 828 | id_num['position'] = bbox |
802 | # 找到法人代表且确定位置 | 829 | # 找到法人代表且确定位置 |
803 | if re.match('法定代表人或授权代表:', text) is not None and np.mean(bbox[::2]) > anchor[0] and np.mean(bbox[1::2]) < anchor[1]: | 830 | if re.match('法定代表人或授权代表:', text) is not None and np.mean(bbox[::2]) > anchor[0] and np.mean(bbox[1::2]) < anchor[1]: |
804 | words = text.split(':')[-1] | 831 | words = text.split(':')[-1] |
832 | if len(words) == 0: | ||
833 | need_bbox_find_keys_bbox[2] = bbox | ||
834 | else: | ||
805 | representative['words'] = words | 835 | representative['words'] = words |
806 | representative['page'] = page_num | 836 | representative['page'] = page_num |
807 | representative['position'] = bbox | 837 | representative['position'] = bbox |
808 | if role_key == '保证人3:': | 838 | elif role_key == '保证人3:': |
809 | # 找到证件号码且确定位置 | 839 | # 找到证件号码且确定位置 |
810 | if re.match('证件号码:', text) is not None and np.mean(bbox[::2]) > anchor[0] and np.mean(bbox[1::2]) > anchor[1]: | 840 | if re.match('证件号码:', text) is not None and np.mean(bbox[::2]) > anchor[0] and np.mean(bbox[1::2]) > anchor[1]: |
811 | words = text.split(':')[-1] | 841 | words = text.split(':')[-1] |
842 | if len(words) == 0: | ||
843 | need_bbox_find_keys_bbox[1] = bbox | ||
844 | else: | ||
812 | id_num['words'] = words | 845 | id_num['words'] = words |
813 | id_num['page'] = page_num | 846 | id_num['page'] = page_num |
814 | id_num['position'] = bbox | 847 | id_num['position'] = bbox |
815 | # 找到法人代表且确定位置 | 848 | # 找到法人代表且确定位置 |
816 | if re.match('法定代表人或授权代表:', text) is not None and np.mean(bbox[::2]) > anchor[0] and np.mean(bbox[1::2]) > anchor[1]: | 849 | if re.match('法定代表人或授权代表:', text) is not None and np.mean(bbox[::2]) > anchor[0] and np.mean(bbox[1::2]) > anchor[1]: |
817 | words = text.split(':')[-1] | 850 | words = text.split(':')[-1] |
851 | if len(words) == 0: | ||
852 | need_bbox_find_keys_bbox[2] = bbox | ||
853 | else: | ||
818 | representative['words'] = words | 854 | representative['words'] = words |
819 | representative['page'] = page_num | 855 | representative['page'] = page_num |
820 | representative['position'] = bbox | 856 | representative['position'] = bbox |
857 | for idx, bbox in enumerate(need_bbox_find_keys_bbox): | ||
858 | if bbox is None: | ||
859 | continue | ||
860 | is_find = False | ||
861 | if idx == 1: | ||
862 | width_rate = 3 | ||
863 | else: | ||
864 | width_rate = 1 | ||
865 | minx = bbox[2] | ||
866 | maxx = bbox[2] + (width_rate * (bbox[2]-bbox[0])) | ||
867 | miny = bbox[1] | ||
868 | maxy = bbox[3] | ||
869 | for block in self.pdf_info[page_num]['blocks']: | ||
870 | if block['type'] != 0: | ||
871 | continue | ||
872 | if is_find: | ||
873 | break | ||
874 | for line in block['lines']: | ||
875 | if is_find: | ||
876 | break | ||
877 | for span in line['spans']: | ||
878 | if is_find: | ||
879 | break | ||
880 | value_bbox, text = span['bbox'], span['text'] | ||
881 | if minx < np.mean(value_bbox[::2]) < maxx and miny < np.mean(value_bbox[1::2]) < maxy: | ||
882 | if idx == 0: | ||
883 | name['words'] = text | ||
884 | name['page'] = page_num | ||
885 | name['position'] = value_bbox | ||
886 | elif idx == 1: | ||
887 | id_num['words'] = text | ||
888 | id_num['page'] = page_num | ||
889 | id_num['position'] = value_bbox | ||
890 | elif idx == 2: | ||
891 | representative['words'] = text | ||
892 | representative['page'] = page_num | ||
893 | representative['position'] = value_bbox | ||
894 | is_find = True | ||
895 | break | ||
821 | return name, id_num, representative | 896 | return name, id_num, representative |
822 | 897 | ||
823 | def get_table_add_product(self): | 898 | def get_table_add_product(self): | ... | ... |
-
Please register or sign in to post a comment